Source: lib/robots.js

var parent_dir = process.getAbsolutePath(__dirname);
var request = require("request");
var fs = require('fs');
var check = require("check-types");
var robots = require('robots');

/**
	Represents Robots.txt parser
	@author Tilak Patidar <tilakpatidar@gmail.com>
	@constructor
	@param {Message} message_obj

*/

var Robots = function Robots(message_obj, parallel_requests) {


    var message = message_obj;
    var log = message.get('log');
    var pool = message.get('pool');
    var urls = message.get('robots_links');
    var that = this;
    var requests = parallel_requests;
    var requests_made = 0;
    var called = false;
    /**
    	Parses robots.txt for loaded urls.
    	@public
    	@param {Function} fn - callback

    */
    this.parse = function(fn) {

        try {
            msg("Preparing robots.txt files this will take time . . .", "info");
            var err;
            loadCache(function init_loadCache() {


                function robots_iterator() {
                    if (called) {
                        return;
                    }
                    msg("requests_made  " + requests_made + "    requests" + requests, "info");
                    var req_url = urls.splice(0, 1)[0];
                    //console.log(req_url);


                    if (!check.assigned(req_url)) {
                        --requests_made;
                        if (urls.length === 0 && !called) {
                            called = true;
                            return fn(err, that.bots);
                        }
                        return;
                    }

                    if (check.assigned(that.bots[req_url])) {
                        --requests_made;
                        if (urls.length === 0 && !called) {
                            called = true;
                            return fn(err, that.bots);
                        }
                        for (var i = requests_made; i < requests; i++) {
                            ++requests_made;
                            process.nextTick(robots_iterator);
                        }
                        return;

                    }




                    var parser = new robots.RobotsParser();
                    var req_url_new = req_url.split("/").slice(0, 3).join("/");
                    var init_time = new Date().getTime();
                    if (!check.assigned(that.bots[req_url])) {

                        parser.setUrl(req_url_new + '/robots.txt', function parser_setUrl(parser_obj, success) {

                            if (success) {
                                msg(("Robots.txt parsed for " + req_url), "success");
                                addToCache(req_url_new, parser_obj, function() {

                                });
                                that.bots[req_url] = parser_obj; //saving robots obj
                            } else {
                                msg(("No Robots.txt found for " + req_url), "error");
                                var parser_obj = {
                                    "NO_ROBOTS": true
                                };
                                addToCache(req_url_new, parser_obj, function() {

                                });
                                that.bots[req_url] = parser_obj; //saving robots obj

                            }

                            --requests_made;

                            if (urls.length === 0 && !called) {
                                called = true;
                                return fn(err, that.bots);
                            }

                            for (var i = requests_made; i < requests; i++) {
                                ++requests_made;
                                process.nextTick(robots_iterator);
                            }

                            return;

                        });
                    } else {

                        --requests_made;
                        if (urls.length === 0 && !called) {
                            called = true;
                            return fn(err, that.bots);
                        }
                        for (var i = requests_made; i < requests; i++) {
                            ++requests_made;
                            process.nextTick(robots_iterator);
                        }
                        return;
                    }





                };

                ++requests_made;
                process.nextTick(robots_iterator);



            });

        } catch (err) {
            //console.log(err)
            return fn(err, null);
        }

    };

    /**
    	Loades already parsed robots.txt from mongodb store.
    	@private
    	@param {Function} fn - callback
    */
    var loadCache = function loadCache(fn) {

        pool.robots_collection.find({}).toArray(function loadCacheFn(err, docs) {
            //console.log(err,docs)
            if (!check.assigned(docs) || check.emptyArray(docs) || check.assigned(err)) {
                msg("Robots.txt cache is empty.", "info");
                return fn();
            } else {
                for (var i = docs.length - 1; i >= 0; i--) {
                    var data = docs[i]["robot"];
                    var domain = docs[i]["_id"];
                    if (check.assigned(data)) {
                        that.bots[domain] = data;
                        msg(("Robots.txt loaded from cache for " + domain), "success");
                    }
                    if (i === 0) {
                        return fn();
                    }
                };

            }
        });

    };

    /**
    	Parsed data is stored in this JSON object.
    	@public
    */
    this.bots = {};

    /**
    	Adds parsed data for a url into mongodb.
    	@private
    	@param {String} key - url
    	@param {String} value - parsed data from robots.txt

    */
    function addToCache(key, value, fn) {
        pool.robots_collection.insert({
            "_id": key,
            "robot": value
        }, function addToCache(err) {
            return fn();
        });
    };

    function msg() {
        log.put(arguments[0], arguments[1], __filename.split('/').pop(), arguments.callee.caller.name.toString());
    }
};

module.exports = Robots;