var urllib = require('url');
var ObjectX = require(__dirname + "/proto.js").ObjectX;
var check = require('check-types');
var _ = require("underscore");
/*
if (!check.assigned(url.match(regex_urlfilter.accept))) { //user give acceptance
console.log("hey21");
if (!config.getConfig("external_links")) {
console.log("hey");
Sync(function(){
var result = tldextract.sync(null, url);
var result1 = tldextract.sync(null, domain);
console.log(result.domain, "\t", domain);
if(result.domain === domain){
//same domain
if(result.subdomain !== result1.subdomain){
//normalize if subdomain is not present
if(result1.subdomain !== ""){
if(result.subdomain === ""){
//empty subdomain
if(check.assigned(url_obj)){
//add subdomain
url = url.replace('http://','http://' + result1.subdomain);
url_obj['url'] = url;
}
}
}
}
}else{
return false;
}
});
}
}
*/
/**
Represents a URL and its crawl details.
Also provides the URL parsing and normalization helpers.
@author Tilak Patidar <tilakpatidar@gmail.com>
@constructor
@param {Message} message_obj
*/
var URLCreator = function(message_obj) {
var message = message_obj;
var config = message.get('config');
var regex_urlfilter = message.get('regex_urlfilter');
var log = message.get('log');
/**
 * Classifies a URL as "file" or "webpage".
 * A URL is a "file" only when the "tika" config flag is truthy and the URL
 * matches the "tika_supported_files" pattern; everything else is a "webpage".
 * @private
 * @param {String} url
 * @return {String} "file" or "webpage"
 */
function getFileType(url) {
    // Without tika every URL is handled as a normal web page.
    if (!config.getConfig("tika")) {
        return "webpage";
    }
    var tikaMatch = url.match(config.getConfig("tika_supported_files"));
    // A match means the file type is supported by tika.
    return check.assigned(tikaMatch) ? "file" : "webpage";
}
/**
 * Extracts the domain part of a url: everything before the third "/"
 * (for "http://host/path" that is "http://host").
 * @private
 * @param {String} url
 * @return {String}
 */
function extractDomain(url) {
    var pieces = url.split("/");
    // Keep at most the first three pieces: scheme, empty piece, host.
    if (pieces.length > 3) {
        pieces.length = 3;
    }
    return pieces.join("/");
}
/**
 * Normalizes the protocol of a url to "http://".
 * Only a leading "http://" or "https://" is stripped before the prefix is
 * added, so a scheme that appears later in the url (for example inside a
 * redirect path) is left untouched. A url without a scheme simply gets the
 * "http://" prefix.
 * @param {String} url
 * @return {String}
 * @private
 */
function normalizeProtocol(url) {
    // Anchored on purpose: String#replace with a plain string would remove
    // the first occurrence anywhere in the url, not just the prefix.
    return "http://" + url.replace(/^https?:\/\//, "");
}
/**
 * Normalizes a domain: drops one trailing "/" and forces the "http://"
 * scheme. Only a leading "http://" or "https://" is replaced; a scheme that
 * appears later in the string is preserved.
 * @param {String} url
 * @return {String}
 * @private
 */
function normalizeDomain(url) {
    var trimmed = url;
    if (trimmed.charAt(trimmed.length - 1) === "/") {
        trimmed = trimmed.slice(0, -1);
    }
    // Anchored so that only the leading scheme is removed.
    return "http://" + trimmed.replace(/^https?:\/\//, "");
}
/**
 * Rebuilds a url with its query parameters sorted by key.
 *
 * - A non-default port is kept (an explicit default port, 80 for http and
 *   443 for https, is dropped).
 * - Repeated parameters ("?x=1&x=2") are emitted once per value, in their
 *   original relative order.
 * - Keys and values are both percent-encoded.
 * - Credentials and the fragment are not part of the result.
 * @param {String} url
 * @return {String}
 * @private
 */
function sortedParams(url) {
    var parts = urllib.parse(url, true);
    var query = parts.query || {};
    var pairs = [];
    Object.keys(query).sort().forEach(function(key) {
        // url.parse yields an array when a key is repeated in the query.
        var values = Array.isArray(query[key]) ? query[key] : [query[key]];
        values.forEach(function(value) {
            pairs.push(encodeURIComponent(key) + "=" + encodeURIComponent(value));
        });
    });
    var sorted = pairs.length > 0 ? "?" + pairs.join("&") : "";

    // `host` carries the port, `hostname` does not; keep the port unless it
    // is the default one for the protocol.
    var host = parts.host;
    var isDefaultPort = (parts.protocol === "http:" && parts.port === "80") ||
        (parts.protocol === "https:" && parts.port === "443");
    if (isDefaultPort) {
        host = host.slice(0, -(parts.port.length + 1));
    }
    return parts.protocol + "//" + host + parts.pathname + sorted;
}
/**
 * Normalizes a url: forces the http:// scheme (normalizeProtocol), sorts the
 * query parameters (sortedParams), removes one trailing "/" and strips a
 * "#fragment" from the last path segment.
 * @param {String} url
 * @return {String}
 * @private
 */
function normalizeURL(url) {
    var normalized = sortedParams(normalizeProtocol(url));
    if (normalized.charAt(normalized.length - 1) === "/") {
        normalized = normalized.slice(0, -1);
    }
    // Declared locally: the segment list must not leak into the global scope.
    var segments = normalized.split("/");
    var lastSegment = segments[segments.length - 1].replace(/#.*/g, '').trim();
    if (lastSegment === "") {
        // A url like /#home would otherwise end in "/" after the replace.
        segments.pop();
    } else {
        segments[segments.length - 1] = lastSegment;
    }
    return segments.join("/");
}
/**
 * Builds the nutch style key of a url: the host with its dot separated
 * labels reversed, followed by ":http" and the rest of the url.
 * Example: "http://www.example.com/a" becomes "com.example.www:http/a".
 * @private
 * @param {String} url
 * @return {String}
 */
function nutchStyleURLKey(url) {
    var withoutScheme = url.replace("http://", "");
    // Host is everything up to the first "/".
    var host = withoutScheme.split('/')[0];
    var reversedHost = host.split(".").reverse().join(".");
    var remainder = withoutScheme.replace(host, "");
    return reversedHost + ":http" + remainder;
}
/**
 * Returns accepted or rejected status of a url based on the crawl config.
 *
 * A url is rejected when any of these holds:
 *  - the domain has an entry in 'links_store' whose 'limit_depth' is not -1
 *    and the url has more "/" separators after the host than that limit;
 *  - it does not match regex_urlfilter.accept;
 *  - "external_links" is off and the url does not contain the domain;
 *  - it matches one of regex_urlfilter.reject, unless it also matches
 *    "tika_supported_files" while "tika" is enabled.
 * @private
 * @param {String} url
 * @param {String} domain
 * @return {boolean}
 */
function isAccepted(url, domain) {
    // Strip only the leading scheme so that its two slashes are not counted;
    // the remaining "/" count is the directory depth of the url.
    var dir_length = url.replace(/^https?:\/\//, "").split('/').length - 1;
    var domain_data = message.get('links_store')[domain];
    if (check.assigned(domain_data)) {
        var allowed = domain_data['limit_depth'];
        // -1 means no depth limit for this domain.
        if (allowed !== -1 && dir_length > allowed) {
            return false;
        }
    }
    if (!check.assigned(url.match(regex_urlfilter.accept))) { //user give acceptance
        return false;
    }
    if (!config.getConfig("external_links") && url.indexOf(domain) < 0) {
        return false;
    }
    for (var i = 0; i < regex_urlfilter.reject.length; i++) {
        if (!check.assigned(url.match(regex_urlfilter.reject[i]))) {
            continue;
        }
        // Matched by a reject regex: only a tika supported file survives,
        // and only while tika is enabled.
        var isTikaFile = check.assigned(url.match(config.getConfig("tika_supported_files")));
        if (!isTikaFile || !config.getConfig("tika")) {
            return false;
        }
    }
    return true;
}
/**
 * Returns a URL object for the given link, with the url details and helper
 * methods to read and update them.
 *
 * The returned object exposes `details`, a sealed copy (made with
 * ObjectX.clone) of the internal record; every setter replaces `details`
 * with a fresh copy. When `url_input` is not a string the returned object
 * has neither `details` nor any method.
 * @param {String} url_input - absolute (http:// or https://) or relative link
 * @param {String} d - domain; required to resolve a relative link
 * @param {String} p - parent
 * @public
 */
this.url = function(url_input, d, p) {
    var domain = d;
    var url = url_input;
    var parent = p;

    /**
     * Tells whether a link starts with the http:// or https:// scheme.
     * A scheme appearing later in the link (e.g. "/share?u=http://x") does
     * not make the link absolute.
     * @private
     * @param {String} link
     * @return {boolean}
     */
    function isAbsoluteLink(link) {
        return link.indexOf("http://") === 0 || link.indexOf("https://") === 0;
    }

    /**
     * Holds the details of one url.
     * @private
     * @constructor
     * @param {String} url
     */
    function URL(url) {
        if (typeof url !== 'string') {
            // Invoked with `new`: the caller receives an empty object, which
            // it can recognise by the missing `details` property.
            return;
        }
        var self = this;
        var url_obj = {};
        url_obj["accepted"] = true;
        if (!isAbsoluteLink(url)) {
            //relative
            if (!check.assigned(domain)) {
                //a relative url without a domain cannot be resolved: reject it
                url_obj["accepted"] = false;
            } else {
                url = urllib.resolve(domain, url);
            }
        } else if (!check.assigned(domain)) {
            //absolute, no domain given: derive it from the url itself
            domain = normalizeDomain(extractDomain(url));
        } else {
            //absolute: normalize the current domain. `domain` (not `d`) is
            //used because `d` is undefined when the domain was derived from
            //the url by an earlier call.
            domain = normalizeDomain(domain);
        }
        url = normalizeURL(url);
        url_obj["url"] = url;
        url_obj["domain"] = domain;
        url_obj["parent"] = check.assigned(parent) ? normalizeURL(parent) : null;
        url_obj["nutch_key"] = nutchStyleURLKey(url);
        if (url_obj["accepted"]) {
            //if not rejected by above code then run isAccepted()
            url_obj["accepted"] = isAccepted(url, domain);
        }
        url_obj["status_code"] = null;
        url_obj["response_time"] = null;
        url_obj["content"] = null;
        url_obj["parsed_content"] = null;
        url_obj["isParsed"] = false;
        url_obj["isIndexed"] = false;
        url_obj["file_type"] = getFileType(url);
        url_obj['redirect'] = null;
        url_obj['bucket_id'] = null;
        url_obj['urlID'] = null;
        url_obj['alternate_urls'] = [];
        url_obj['canonical_url'] = null;
        url_obj["content_md5"] = null;
        url_obj["header_content_type"] = null;
        url_obj["normal_queue"] = true;
        url_obj['level'] = url_obj["url"].replace('http://', '').split('/').length;

        /**
         * Publishes a sealed copy of the internal record as `details`.
         * Called once on construction and again after every change.
         * @private
         */
        function publishDetails() {
            var rep = ObjectX.clone(url_obj);
            Object.seal(rep);
            self.details = rep;
        }

        /**
         * Parses a link found on this page and returns its normalized url.
         * Relative links are resolved against the current domain.
         * @private
         * @param {String} link
         * @return {String|null} null when the link is not a string
         */
        function normalizedLinkedUrl(link) {
            var parsed = new URL(link);
            if (!check.assigned(parsed.details)) {
                return null;
            }
            return parsed.details.url;
        }

        publishDetails();

        //setters
        /**
         Set normal queue
         @public
         */
        this.setNormalQueue = function() {
            url_obj["normal_queue"] = true;
            publishDetails();
        };
        /**
         Set failed queue
         @public
         */
        this.setFailedQueue = function() {
            url_obj["normal_queue"] = false;
            publishDetails();
        };
        /**
         Set header content type
         @public
         @param {String} header
         */
        this.setHeaderContentType = function(header) {
            url_obj["header_content_type"] = header;
            publishDetails();
        };
        /**
         Add alternate language url. Ignored when l is not a string.
         @public
         @param {String} l - url
         @param {String} lang - language (currently not stored)
         */
        this.addAlternateUrl = function(l, lang) {
            var alternate = normalizedLinkedUrl(l);
            if (alternate === null) {
                return;
            }
            url_obj["alternate_urls"].push(alternate);
            publishDetails();
        };
        /**
         Set canonical url, which points to the page having the same content.
         Ignored when l is not a string.
         @public
         @param {String} l - url
         */
        this.setCanonicalUrl = function(l) {
            var canonical = normalizedLinkedUrl(l);
            if (canonical === null) {
                return;
            }
            url_obj["canonical_url"] = canonical;
            publishDetails();
        };
        /**
         Set md5 hash of content this URL has.
         @public
         @param {String} l - string rep of hash
         */
        this.setContentMd5 = function(l) {
            url_obj["content_md5"] = l;
            publishDetails();
        };
        /**
         Set urlId
         @public
         @param {String} code - the urlId assigned by mongodb on insert
         */
        this.setUrlId = function(code) {
            url_obj["urlID"] = code;
            publishDetails();
        };
        /**
         Set redirectUrl (stored normalized)
         @public
         @param {String} url
         */
        this.setRedirectedURL = function(url) {
            url_obj["redirect"] = normalizeURL(url);
            publishDetails();
        };
        /**
         Set bucketId
         @public
         @param {String} idd
         */
        this.setBucketId = function(idd) {
            url_obj["bucket_id"] = idd;
            publishDetails();
        };
        /**
         Set response time
         @public
         @param {String} response
         */
        this.setResponseTime = function(response) {
            url_obj["response_time"] = response;
            publishDetails();
        };
        /**
         Set crawled content
         @public
         @param {Object} content
         */
        this.setContent = function(content) {
            url_obj["content"] = content;
            publishDetails();
        };
        /**
         Set parsed content and mark the url as parsed
         @public
         @param {Object} parsed
         */
        this.setParsed = function(parsed) {
            url_obj["parsed_content"] = parsed;
            url_obj["isParsed"] = true;
            publishDetails();
        };
        /**
         Set status code received on this url
         @public
         @param {String} code
         */
        this.setStatusCode = function(code) {
            url_obj["status_code"] = code;
            publishDetails();
        };

        //getters
        /**
         Get md5 content.
         @public
         */
        this.getContentMd5 = function() {
            return url_obj["content_md5"];
        };
        /**
         Get bucket id.
         @public
         */
        this.getBucketId = function() {
            return url_obj['bucket_id'];
        };
        /**
         Get status code.
         @public
         */
        this.getStatusCode = function() {
            return url_obj["status_code"];
        };
        /**
         Get url.
         @public
         */
        this.getURL = function() {
            return url_obj["url"];
        };
        /**
         Get urlId.
         @public
         */
        this.getUrlId = function() {
            return url_obj["urlID"];
        };
        /**
         Get redirect url.
         @public
         */
        this.getRedirectedURL = function() {
            return url_obj["redirect"];
        };
        /**
         Get domain.
         @public
         */
        this.getDomain = function() {
            return url_obj["domain"];
        };
        /**
         Get nutch style url rep.
         @public
         */
        this.getNutchKey = function() {
            return url_obj["nutch_key"];
        };
        /**
         Get accepted or rejected status.
         @public
         */
        this.isAccepted = function() {
            return url_obj["accepted"];
        };
        /**
         Get parent of this url.
         @public
         */
        this.getParent = function() {
            return url_obj["parent"];
        };
        /**
         Get response time.
         @public
         */
        this.getResponseTime = function() {
            return url_obj["response_time"];
        };
        /**
         Get HTML content.
         @public
         */
        this.getHTMLContent = function() {
            return url_obj["content"];
        };
        /**
         Get parsed content.
         @public
         */
        this.getParsedContent = function() {
            return url_obj["parsed_content"];
        };
        /**
         Returns indexed status.
         @public
         */
        this.isIndexed = function() {
            return url_obj["isIndexed"];
        };
        /**
         Returns parsed status.
         @public
         */
        this.isParsed = function() {
            return url_obj["isParsed"];
        };
    }
    return new URL(url);
};
/**
 * Logs a message unless process.RUN_ENV is "TEST".
 * The first two arguments are forwarded to log.put together with this
 * file's name when a logger is assigned, otherwise to console.log.
 * @private
 */
function msg() {
    if (process.RUN_ENV === "TEST") {
        return;
    }
    var first = arguments[0];
    var second = arguments[1];
    if (check.assigned(log)) {
        log.put(first, second, __filename.split('/').pop());
        return;
    }
    console.log(first, second);
}
};
module.exports = URLCreator;