1 /*! 
  2  * pjscrape Copyright 2011 Nick Rabinowitz.
  3  * Licensed under the MIT License (see LICENSE.txt)
  4  */
  5 
  6 /**
  7  * @overview
  8  * <p>Scraping harness for PhantomJS. Requires PhantomJS or PyPhantomJS v.1.3
  9  * </p>
 10  *
 11  * @name pjscrape.js
 12  * @author Nick Rabinowitz (www.nickrabinowitz.com)
 13  * @version 0.1
 14  */
 15 
 16 var fs = require('fs');
 17 
 18 phantom.injectJs('lib/md5.js');
 19 
 20 function fail(msg) {
 21     console.log('FATAL ERROR: ' + msg);
 22     phantom.exit(1);
 23 };
 24 
 25 /**
 26  * @namespace
 27  * Root namespace for PhantomJS-side code
 28  * @name pjs
 29  */
 30 var pjs = (function(){
 31     var config = {
 32             timeoutInterval: 100,
 33             timeoutLimit: 3000,
 34             log: 'stdout',
 35             writer: 'stdout',
 36             format: 'json',
 37             logFile: 'pjscrape_log.txt',
 38             outFile: 'pjscrape_out.txt'
 39         };
 40         
 41     var suites = [];
 42         
 43         
 44     // utils
 45     function isFunction(f) {
 46         return typeof f === 'function';
 47     }
 48     function isObject(o) {
 49         return typeof o === 'object';
 50     }
 51     function funcify(f) {
 52         return isFunction(f) ? f : function() { return f };
 53     }
 54     function isArray(a) {
 55         return Array.isArray(a);
 56     }
 57     function arrify(a) {
 58         return isArray(a) ? a : a ? [a] : [];
 59     }
 60     function getKeys(o) {
 61         var keys = [];
 62         for (var key in o) keys.push(key);
 63         return keys;
 64     }
 65     function extend(obj) {
 66         Array.prototype.slice.call(arguments, 1).forEach(function(source) {
 67             for (var prop in source) {
 68                 if (source[prop] !== void 0) obj[prop] = source[prop];
 69             }
 70         });
 71         return obj;
 72     };
 73 
 74     /**
 75      * @name pjs.loggers
 76      * @namespace
 77      * Logger namespace. You can add new loggers here; new logger classes
 78      * should probably extend pjs.loggers.base and redefine the 
 79      * <code>log</code> method.
 80      * @example
 81         // create a new logger
 82         pjs.loggers.myLogger = function() {
 83             return new pjs.loggers.base(function(msg) { 
 84                 // do some special logging stuff
 85             });
 86         };
 87         // tell pjscrape to use your logger
 88         pjs.config({
 89             log: 'myLogger'
 90         });
 91      */
 92     var loggers = {
 93     
 94         /** 
 95          * @name pjs.loggers.base
 96          * @class Abstract base logger class
 97          * @private
 98          */
 99         base: function(logf) {
100             var log = this;
101             log.log = logf || function(msg) { console.log(msg) };
102             log.msg = function(msg) { log.log('* ' + msg) };
103             log.alert = function(msg) { log.log('! ' + msg) };
104             log.error = function(msg) { log.log('ERROR: ' + msg) };
105         },
106         
107         /** 
108          * Log to config.logFile
109          * @name pjs.loggers.file
110          * @type Logger
111          */
112         file: function() {
113             return new loggers.base(function(msg) { 
114                 fs.write(config.logFile, msg + "\n", 'a');
115             });
116         },
117         
118         /**
119          * Disable logging
120          * @name pjs.loggers.none
121          * @type Logger
122          */
123         none: function() {
124             return new loggers.base(function() {});
125         }
126     };
127         
128     /**
129      * Log to STDOUT
130      * @name pjs.loggers.stdout
131      * @type Logger
132      */
133     loggers.stdout = loggers.base;
134 
135     /**
136      * @name pjs.formatters
137      * @namespace
138      * Formatter namespace. You can add new formatters here; new formatter classes
139      * should have the properties <code>start</code>, <code>end</code>, and 
140      * <code>delimiter</code>, and the method <code>format(item)</code>. You might
141      * save some time by inheriting from formatters.raw or formatters.json.
142      * @example
143         // create a new formatter
144         pjs.formatters.pipe = function() {
145             var f = new pjs.formatters.raw();
146             f.delimiter = '|';
147             return f;
148         };
149         // tell pjscrape to use your formatter
150         pjs.config({
151             format: 'pipe'
152         });
153      */
154     var formatters = {
155         
156         /** 
157          * Raw formatter - just uses toString()
158          * @name pjs.formatters.raw 
159          * @type Formatter
160          */
161         raw: function() {
162             var f = this;
163             f.start = f.end = f.delimiter = '';
164             f.format = function(item) {
165                 return item.toString();
166             };
167         },
168         
169         /** 
170          * Format output as a JSON array
171          * @name pjs.formatters.json 
172          * @type Formatter
173          */
174         json: function() {
175             var f = this;
176             f.start = '[';
177             f.end = ']';
178             f.delimiter = ',';
179             f.format = function(item) {
180                 return JSON.stringify(item);
181             };
182         },
183         
184         /** 
185          * CSV formatter - takes arrays or objects, fields defined by 
186          * config.csvFields or auto-generated based on first item
187          * @name pjs.formatters.csv 
188          * @type Formatter
189          */
190         csv: function() {
191             var f = this,
192                 fields = config.csvFields,
193                 makeRow = function(a) { return a.map(JSON.stringify).join(',') };
194                 
195             f.delimiter = "\r\n";
196             f.start = fields ? makeRow(fields) + f.delimiter : '';
197             f.end = '';
198             f.format = function(item) {
199                 if (item && typeof item == 'object') {
200                     var out = '';
201                     // make fields if not defined
202                     if (!fields) {
203                         if (isArray(item)) {
204                             fields = [];
205                             for (var i=0; i<item.length; i++) fields[i] = 'Column ' + (i+1);
206                         } else fields = getKeys(item);
207                         out = makeRow(fields) + f.delimiter;
208                     }
209                     // make an array out of an object if necessary
210                     if (!isArray(item)) {
211                         var tmp = [];
212                         fields.forEach(function(field) {
213                             tmp.push(item[field] || '');
214                         });
215                         item = tmp;
216                     }
217                     return out + item
218                         // too long?
219                         .slice(0, fields.length)
220                         // too short?
221                         .concat(item.length < fields.length ? 
222                             new Array(fields.length - item.length) :
223                             [])
224                         // quote strings if necessary, etc
225                         .map(function(v) {
226                             // escape double quotes with two double quotes
227                             return JSON.stringify(v).replace(/\\"/g, '""');
228                         })
229                         .join(',');
230                 }
231             };
232         }
233     };
234 
235     /**
236      * @name pjs.writers
237      * @namespace
238      * <p>Writer namespace. You can add new writers here; new writer classes
239      * should probably extend pjs.writers.base and redefine the 
240      * <code>write</code> method.</p>
241      * <p>Items returned by scrapers will be added to the output via
242      * <code>Writer.add(item)</code>, which can take any type of object. If
243      * an array is provided, multiple items will be added.</p>
244      * @example
245         // create a new writer
246         pjs.writers.myWriter = function(log) {
247             var w = new pjs.writers.base(log);
248             w.write = function(s) {
249                 // write s to some special place
250             }
251             return w;
252         };
253         // tell pjscrape to use your writer
254         pjs.config({
255             writer: 'myWriter'
256         });
257      */
258     var writers = {
259         /** 
260          * @name pjs.writers.base
261          * @class Abstract base writer class
262          * @private
263          */
264         base: function(log) {
265             var w = this,
266                 count = 0,
267                 items = [],
268                 batchSize = config.batchSize,
269                 format = config.format || 'json',
270                 firstWrite = true,
271                 lastWrite = false;
272             
273             // init formatter
274             var formatter = new formatters[format]();
275             
276             // write output
277             var writeBatch = function(batch) {
278                 log.msg('Writing ' + batch.length + ' items');
279                 w.write(
280                     (firstWrite ? formatter.start : formatter.delimiter) +
281                     batch.map(formatter.format).join(formatter.delimiter) +
282                     (lastWrite ? formatter.end : '')
283                 );
284                 firstWrite = false;
285             };
286             
287             /** 
288              * Add an item to be written to output
289              * @name pjs.writers.base#add 
290              * @function
291              * @param {Object|String|Array} Item to add
292              */
293             w.add = function(i) {
294                 // add to items
295                 if (i) {
296                     i = arrify(i);
297                     items = items.concat(i);
298                     count += i.length;
299                     // write if necessary
300                     if (batchSize && items.length > batchSize) {
301                         writeBatch(items.splice(0, batchSize));
302                     }
303                 }
304             };
305             
306             /** 
307              * Finish up writing output
308              * @name pjs.writers.base#finish 
309              * @function
310              */
311             w.finish = function() {
312                 lastWrite = true;
313                 writeBatch(items);
314             };
315             
316             /** 
317              * Get the number of items written to output
318              * @name pjs.writers.base#count 
319              * @function
320              * @return {Number}     Number of items written
321              */
322             w.count = function() {
323                 return count;
324             };
325             
326             /** 
327              * Write a string to output
328              * @name pjs.writers.base#write 
329              * @function
330              * @param {String} s    String to write
331              */
332             w.write = function(s) { 
333                 console.log(s);
334             };
335         },
336         
337         /** 
338          * Writes output to config.outFile
339          * @name pjs.writers.file 
340          * @type Writer
341          */
342         file: function(log) {
343             var w = new writers.base(log);
344             // clear file
345             fs.write(config.outFile, '', 'w');
346             // write method
347             w.write = function(s) { 
348                 fs.write(config.outFile, s, 'a');
349             };
350             return w;
351         },
352         
353         /** 
354          * Writes output to one file per item. Items may be provided
355          * in the format <code>{ filename: "file.txt", content: "string" }</code>
356          * if you'd like to specify the filename in the scraper. Otherwise,
357          * files are written to config.outFile with serial numbering.
358          * @name pjs.writers.itemfile 
359          * @type Writer
360          */
361         itemfile: function(log) {
362             var w = this,
363                 count = 0,
364                 format = config.format || 'raw',
365                 formatter = new formatters[format]();
366             
367             w.add = function(items) {
368                 // add to items
369                 if (items) {
370                     items = arrify(items);
371                     // write to separate files
372                     items.forEach(function(item) {
373                         var filename;
374                         // support per-item filename syntax
375                         if (item.filename && item.content) {
376                             filename = item.filename;
377                             item = item.content;
378                         } 
379                         // otherwise add a serial number to config.outFile
380                         else {
381                             var fileparts = config.outFile.split('.'),
382                                 ext = fileparts.pop();
383                             filename = fileparts.join('.') + '-' + (count++) + '.' + ext;
384                         }
385                         fs.write(filename, formatter.format(item), 'w');
386                         count++;
387                     });
388                 }
389             };
390             
391             w.finish = function() {};
392             
393             w.count = function() {
394                 return count;
395             };
396         },
397     };
398         
399     /**
400      * Write output to STDOUT
401      * @name pjs.writers.stdout
402      * @type Writer
403      */
404     writers.stdout = writers.base;
405     
406     /**
407      * @name pjs.hashFunctions
408      * @namespace
409      * Hash function namespace. You can add new hash functions here; hash functions
410      * should take an item and return a unique (or unique-enough) string. 
411      * @example
412         // create a new hash function
413         pjs.hashFunctions.myHash = function(item) {
414             return item.mySpecialUID;
415         };
416         // tell pjscrape to ignore dupes
417         pjs.config({
418             ignoreDuplicates: true
419         });
420         // tell pjscrape to use your hash function
421         pjs.addSuite({
422             hashFunction: pjs.hashFunctions.myHash,
423             // etc
424         });
425      */
426     var hashFunctions = {
427         /** UID hash - assumes item.id; falls back on md5
428          * @name pjs.hashFunctions.id
429          * @type HashFunction
430          */
431         id: function(item) {
432             return ('id' in item) ? item.id : hashFunctions.md5(item);
433         },
434         /** md5 hash - collisions are possible
435          * @name pjs.hashFunctions.md5
436          * @type HashFunction
437          */
438         md5: function(item) {
439             return md5(JSON.stringify(item));
440         }
441     };
442      
443 
444     // suite runner
445     var runner = (function() {
446         var visited = {},
447             itemHashes = {},
448             log, 
449             writer;
450         
451         /**
452          * @class
453          * Singleton: Manage multiple suites
454          * @private
455          */
456         var SuiteManager = new function() {
457             var mgr = this,
458                 complete,
459                 suiteq = [];
460                 
461             // create a single WebPage object for reuse
462             var page = require('webpage').create({
463                 // set up console output
464                 onConsoleMessage: function(msg, line, id) {
465                     // kill initialization message
466                     if (msg.indexOf('___') === 0) return;
467                     id = id || 'injected code';
468                     if (line) msg += ' (' + id + ' line ' + line + ')';
469                     log.msg('CLIENT: ' + msg);
470                 },
471                 onAlert: function(msg) { log.alert('CLIENT: ' + msg) }
472             });
473             
474             // add waitFor method
475             page.waitFor = function(test, callback) {
476                 // check for short-circuit
477                 if (this.evaluate(test)) {
478                     callback(page);
479                 } else {
480                     // poll until timeout or success
481                     var elapsed = 0,
482                         timeoutId = window.setInterval(function() {
483                             if (page.evaluate(test) || elapsed > config.timeoutLimit) {
484                                 if (elapsed > config.timeoutLimit) {
485                                     log.alert('Timeout after ' + ~~(elapsed / 1000) + ' seconds');
486                                 }
487                                 window.clearInterval(timeoutId);
488                                 callback(page);
489                             } else {
490                                 elapsed += config.timeoutInterval;
491                             }
492                         }, config.timeoutInterval);
493                 }
494             };
495             
496             mgr.getPage = function() {
497                 return page;
498             };
499             
500             // set the completion callback
501             mgr.setComplete = function(cb) {
502                 complete = cb;
503             };
504             
505             // add a ScraperSuite
506             mgr.add = function(suite) {
507                 suiteq.push(suite);
508             };
509             
510             // run the next ScraperSuite in the queue
511             mgr.runNext = function() {
512                 var suite = suiteq.shift();
513                 if (suite) suite.run();
514                 else complete();
515             };
516         }();
517         
518         /**
519          * @class
520          * Scraper suite class - represents a set of urls to scrape
521          * @private
522          * @param {String} title        Title for verbose output
523          * @param {String[]} urls       Urls to scrape
524          * @param {Object} opts         Configuration object
525          */
526         var ScraperSuite = function(title, urls, opts) {
527             var s = this,
528                 truef = function() { return true };
529             // set up options
530             s.title = title;
531             s.urls = urls;
532             s.opts = extend({
533                 ready: function() { return _pjs.ready; },
534                 scrapable: truef,
535                 preScrape: truef,
536                 hashFunction: hashFunctions.id
537             }, config, opts);
538             // deal with potential arrays and syntax variants
539             s.opts.loadScript = arrify(s.opts.loadScripts || s.opts.loadScript);
540             s.opts.scrapers = arrify(s.opts.scrapers || s.opts.scraper);
541             // set up completion callback
542             s.complete = function() {
543                 log.msg(s.title + " complete");
544                 SuiteManager.runNext();
545             };
546             s.depth = 0;
547         }
548         
549         ScraperSuite.prototype = {
550         
551             /**
552              * Add an item, checking for duplicates as necessary
553              * @param {Object|Array} items      Item(s) to add
554              * @private
555              */
556             addItem: function(items) {
557                 var s = this;
558                 if (items && config.ignoreDuplicates) {
559                     // ensure array
560                     items = arrify(items);
561                     items = items.filter(function(item) {
562                         var hash = s.opts.hashFunction(item);
563                         if (!itemHashes[hash]) {
564                             // hash miss - new item
565                             itemHashes[hash] = true;
566                             return true;
567                         } else {
568                             // hash hit - likely duplicate
569                             // Could do a second-layer check against the actual object,
570                             // but that requires retaining items in memory - skip for now
571                             return false;
572                         }
573                     });
574                 }
575                 writer.add(items);
576             },
577             
578             /**
579              * Run the suite, scraping each url
580              * @private
581              */
582             run: function() {
583                 var s = this,
584                     scrapers = s.opts.scrapers,
585                     i = 0,
586                     // get base URL for avoiding repeat visits and recursion loops
587                     baseUrl = function(url) {
588                         return s.opts.newHashNewPage ? url.split('#')[0] : url;
589                     },
590                     // completion callback
591                     complete = function(page) {
592                         // recurse if necessary
593                         if (page && s.opts.moreUrls) {
594                             // allow selector-only spiders
595                             if (typeof s.opts.moreUrls == 'string') {
596                                 s.opts.moreUrls = new Function(
597                                     "return _pjs.getAnchorUrls('" + s.opts.moreUrls + "');"
598                                 );
599                             }
600                             // look for more urls on this page
601                             var moreUrls = page.evaluate(s.opts.moreUrls);
602                             if (moreUrls && (!s.opts.maxDepth || s.depth < s.opts.maxDepth)) {
603                                 if (moreUrls.length) {
604                                     log.msg('Found ' + moreUrls.length + ' additional urls to scrape');
605                                     // make a new sub-suite
606                                     var ss = new ScraperSuite(s.title + '-sub' + i++, moreUrls, s.opts);
607                                     ss.depth = s.depth + 1;
608                                     SuiteManager.add(ss);
609                                 }
610                             }
611                         }
612                         runNext();
613                     },
614                     runCounter = 0;
615                 // run each
616                 function runNext() {
617                     if (runCounter < s.urls.length) {
618                         url = baseUrl(s.urls[runCounter++]);
619                         // avoid repeat visits
620                         if (!config.allowRepeatUrls && url in visited) {
621                             runNext();
622                         } else {
623                             // scrape this url
624                             s.scrape(url, scrapers, complete);
625                         }
626                     } else {
627                         s.complete();
628                     }
629                 }
630                 log.msg(s.title + " starting");
631                 runNext();
632             },
633             
634             /**
635              * Scrape a single page.
636              * @param {String} url          Url of page to scrape
637              * @param {Function} scrapePage Function to scrape page with
638              * @param {Function} complete   Callback function to run when complete
639              * @private
640              */
641             scrape: function(url, scrapers, complete) {
642                 var suite = this,
643                     opts = suite.opts,
644                     page = SuiteManager.getPage();
645                 log.msg('Opening ' + url);
646                 // set up callback to look for response codes
647                 page.onResourceReceived = function(res) {
648                     if (res.stage == 'end' && res.url == url) {
649                         page.resource = res;
650                     }
651                 };
652                 // run the scrape
653                 page.open(url, function(status) {
654                     // check for load errors
655                     if (status != "success") {
656                         log.error('Page did not load (status=' + status + '): ' + url);
657                         complete(false);
658                     }
659                     // look for 4xx or 5xx status codes
660                     var statusCodeStart = String(page.resource.status).charAt(0);
661                     if (statusCodeStart == '4' || statusCodeStart == '5') {
662                         if (page.resource.status == 404) {
663                             log.error('Page not found: ' + url);
664                         } else {
665                             log.error('Page error code ' + page.resource.status + ' on ' + url);
666                         }
667                         complete(false);
668                     }
669                     // mark as visited
670                     visited[url] = true;
671                     log.msg('Scraping ' + url);
672                     // load jQuery
673                     page.injectJs('client/jquery.js');
674                     page.evaluate(function() { 
675                         window._pjs$ = jQuery.noConflict(true); 
676                     });
677                     // load pjscrape client-side code
678                     page.injectJs('client/pjscrape_client.js');
679                     // reset the global jQuery vars
680                     if (!opts.noConflict) {
681                         page.evaluate(function() {
682                             window.$ = window.jQuery = window._pjs$; 
683                         });
684                     }
685                     // run scraper(s)
686                     page.waitFor(opts.ready, function(page) {
687                         if (page.evaluate(opts.scrapable)) {
688                             // load script(s) if necessary
689                             if (opts.loadScript) {
690                                 opts.loadScript.forEach(function(script) {
691                                     page.injectJs(script);
692                                 })
693                             }
694                             // run prescrape
695                             page.evaluate(opts.preScrape);
696                             // run each scraper and send any results to writer
697                             if (scrapers && scrapers.length) {
698                                 // set up callback manager
699                                 var i = 0;
700                                 function checkComplete() {
701                                     if (++i == scrapers.length) {
702                                         complete(page);
703                                     }
704                                 }
705                                 // run all scrapers
706                                 scrapers.forEach(function(scraper) {
707                                     if (isFunction(scraper)) {
708                                         // standard scraper
709                                         suite.addItem(page.evaluate(scraper));
710                                         checkComplete();
711                                     } else if (typeof scraper == 'string') {
712                                         // selector-only scraper
713                                         suite.addItem(page.evaluate(new Function(
714                                             "return _pjs.getText('" + scraper + "');"
715                                         )));
716                                         checkComplete();
717                                     } else if (scraper.scraper) {
718                                         // wrapped scraper, more options (just async now)
719                                         if (scraper.async) {
720                                             // start the scrape
721                                             page.evaluate(scraper.scraper);
722                                             // wait for the scraper to return items
723                                             page.waitFor(
724                                                 function() {
725                                                     return _pjs.items !== undefined 
726                                                 },
727                                                 function() {
728                                                     suite.addItem(page.evaluate(function() {
729                                                         return _pjs.items;
730                                                     }));
731                                                     checkComplete();
732                                                 }
733                                             );
734                                         }
735                                     }
736                                 });
737                             }
738                         } else {
739                             complete(page);
740                         }
741                     });
742                 });
743             }
744         };
745         
746         /**
747          * Run the set of configured scraper suites.
748          * @name pjs.init
749          */
750         function init() {
751             // check requirements
752             if (!suites.length) fail('No suites configured');
753             if (!(config.log in loggers)) fail('Could not find logger: "' + config.log + '"');
754             if (!(config.writer in writers)) fail('Could not find writer "' + config.writer + '"');
755             
756             // init logger
757             log = new loggers[config.log]();
758             // init writer
759             writer = new writers[config.writer](log);
760             
761             // init suite manager
762             SuiteManager.setComplete(function() {
763                 // scrape complete
764                 writer.finish();
765                 log.msg('Saved ' + writer.count() + ' items');
766                 phantom.exit();
767             });
768             // make all suites
769             suites.forEach(function(suite, i) {
770                 SuiteManager.add(new ScraperSuite(
771                     suite.title || "Suite " + i, 
772                     arrify(suite.url || suite.urls),
773                     suite
774                 ));
775             });
776             // start the suite manager
777             SuiteManager.runNext();
778         }
779         
780         return {
781             init: init
782         }
783     }());
784 
785     // expose namespaces and API functions
786     return {
787         loggers: loggers,
788         formatters: formatters,
789         writers: writers,
790         hashFunctions: hashFunctions,
791         init: runner.init,
792 
793         /**
794          * Set one or more config variables, applying to all suites
795          * @name pjs.config
796          * @param {String|Object} key   Either a key to set or an object with
797          *                              multiple values to set
798          * @param {mixed} [val]         Value to set if using config(key, val) syntax
799          */
800         config: function(key, val) {
801             if (!key) {
802                 return config;
803             } else if (typeof key == 'object') {
804                 extend(config, key);
805             } else if (val) {
806                 config[key] = val;
807             }
808         },
809 
810         /**
811          * Add one or more scraper suites to be run.
812          * @name pjs.addSuite
813          * @param {Object} suite    Scraper suite configuration object
814          * @param {Object} [...]    More suite configuration objects
815          */
816         addSuite: function() { 
817             suites = Array.prototype.concat.apply(suites, arguments);
818         },
819 
820         /**
821          * Shorthand function to add a simple scraper suite.
822          * @name pjs.addScraper
823          * @param {String|String[]} url     URL or array of URLs to scrape
824          * @param {Function|Function[]}     Scraper function or array of scraper functions
825          */
826         addScraper: function(url, scraper) {
827             suites.push({url:url, scraper:scraper});
828         }
829     };
830 }());
831 
832  
833 // make sure we have a config file
if (!phantom.args.length) {
    // die
    console.log('Usage: pjscrape.js <configfile.js> ...');
    phantom.exit();
} else {
    // load the config file(s), stopping at the first one that is missing
    var configsLoaded = phantom.args.every(function(configFile) {
        if (phantom.injectJs(configFile)) {
            return true;
        }
        fail('Config file not found: ' + configFile);
        return false;
    });
    // Start the scrape only when every config file was loaded. The start
    // is kept inside this branch so that it never runs after the usage
    // message or a failed load, without depending on phantom.exit()
    // stopping the script on the spot.
    if (configsLoaded) {
        pjs.init();
    }
}
848 
849