/*!
 * pjscrape Copyright 2011 Nick Rabinowitz.
 * Licensed under the MIT License (see LICENSE.txt)
 */

/**
 * @overview
 * <p>Scraping harness for PhantomJS. Requires PhantomJS or PyPhantomJS v.1.3
 * </p>
 *
 * @name pjscrape.js
 * @author Nick Rabinowitz (www.nickrabinowitz.com)
 * @version 0.1
 */

var fs = require('fs');

phantom.injectJs('lib/md5.js');

/**
 * Report an unrecoverable error and ask PhantomJS to exit with status 1.
 * Callers should return after calling this rather than rely on
 * phantom.exit() to halt the running script.
 * @param {String} msg      Description of the failure
 */
function fail(msg) {
    console.log('FATAL ERROR: ' + msg);
    phantom.exit(1);
}

/**
 * @namespace
 * Root namespace for PhantomJS-side code
 * @name pjs
 */
var pjs = (function() {
    var config = {
            timeoutInterval: 100,
            timeoutLimit: 3000,
            log: 'stdout',
            writer: 'stdout',
            format: 'json',
            logFile: 'pjscrape_log.txt',
            outFile: 'pjscrape_out.txt'
        };

    var suites = [];


    // utils
    function isFunction(f) {
        return typeof f === 'function';
    }
    function isObject(o) {
        // typeof null is 'object', so exclude it explicitly
        return o !== null && typeof o === 'object';
    }
    function funcify(f) {
        return isFunction(f) ? f : function() { return f; };
    }
    function isArray(a) {
        return Array.isArray(a);
    }
    function arrify(a) {
        return isArray(a) ? a : a ? [a] : [];
    }
    function getKeys(o) {
        var keys = [];
        for (var key in o) keys.push(key);
        return keys;
    }
    // own-property test, safe for objects used as string-keyed lookup tables
    function has(o, key) {
        return Object.prototype.hasOwnProperty.call(o, key);
    }
    // copy every defined property of each later argument onto obj
    function extend(obj) {
        Array.prototype.slice.call(arguments, 1).forEach(function(source) {
            for (var prop in source) {
                if (source[prop] !== void 0) obj[prop] = source[prop];
            }
        });
        return obj;
    }

    /**
     * @name pjs.loggers
     * @namespace
     * Logger namespace. You can add new loggers here; new logger classes
     * should probably extend pjs.loggers.base and redefine the
     * <code>log</code> method.
     * @example
        // create a new logger
        pjs.loggers.myLogger = function() {
            return new pjs.loggers.base(function(msg) {
                // do some special logging stuff
            });
        };
        // tell pjscrape to use your logger
        pjs.config({
            log: 'myLogger'
        });
     */
    var loggers = {

        /**
         * @name pjs.loggers.base
         * @class Abstract base logger class
         * @param {Function} [logf]     Function receiving each formatted
         *                              message; defaults to console.log
         * @private
         */
        base: function(logf) {
            var log = this;
            log.log = logf || function(msg) { console.log(msg); };
            log.msg = function(msg) { log.log('* ' + msg); };
            log.alert = function(msg) { log.log('! ' + msg); };
            log.error = function(msg) { log.log('ERROR: ' + msg); };
        },

        /**
         * Log to config.logFile
         * @name pjs.loggers.file
         * @type Logger
         */
        file: function() {
            return new loggers.base(function(msg) {
                fs.write(config.logFile, msg + "\n", 'a');
            });
        },

        /**
         * Disable logging
         * @name pjs.loggers.none
         * @type Logger
         */
        none: function() {
            return new loggers.base(function() {});
        }
    };

    /**
     * Log to STDOUT
     * @name pjs.loggers.stdout
     * @type Logger
     */
    loggers.stdout = loggers.base;

    /**
     * @name pjs.formatters
     * @namespace
     * Formatter namespace. You can add new formatters here; new formatter classes
     * should have the properties <code>start</code>, <code>end</code>, and
     * <code>delimiter</code>, and the method <code>format(item)</code>. You might
     * save some time by inheriting from formatters.raw or formatters.json.
     * @example
        // create a new formatter
        pjs.formatters.pipe = function() {
            var f = new pjs.formatters.raw();
            f.delimiter = '|';
            return f;
        };
        // tell pjscrape to use your formatter
        pjs.config({
            format: 'pipe'
        });
     */
    var formatters = {

        /**
         * Raw formatter - just converts each item to a string
         * @name pjs.formatters.raw
         * @type Formatter
         */
        raw: function() {
            var f = this;
            f.start = f.end = f.delimiter = '';
            f.format = function(item) {
                // String() rather than item.toString() so that null and
                // undefined items do not throw
                return String(item);
            };
        },

        /**
         * Format output as a JSON array
         * @name pjs.formatters.json
         * @type Formatter
         */
        json: function() {
            var f = this;
            f.start = '[';
            f.end = ']';
            f.delimiter = ',';
            f.format = function(item) {
                return JSON.stringify(item);
            };
        },

        /**
         * CSV formatter - takes arrays or objects, fields defined by
         * config.csvFields or auto-generated based on first item
         * @name pjs.formatters.csv
         * @type Formatter
         */
        csv: function() {
            var f = this,
                fields = config.csvFields,
                // quote one value, escaping double quotes with two double quotes
                makeCell = function(v) {
                    var s = JSON.stringify(v);
                    // JSON.stringify yields undefined for undefined values
                    // and functions; emit an empty cell instead of throwing
                    return s === undefined ? '' : s.replace(/\\"/g, '""');
                },
                makeRow = function(a) {
                    return a.map(function(v) { return makeCell(v); }).join(',');
                };

            f.delimiter = "\r\n";
            f.start = fields ? makeRow(fields) + f.delimiter : '';
            f.end = '';
            f.format = function(item) {
                if (item && typeof item == 'object') {
                    var out = '',
                        cells = [],
                        value,
                        i;
                    // make fields if not defined
                    if (!fields) {
                        if (isArray(item)) {
                            fields = [];
                            for (i = 0; i < item.length; i++) fields[i] = 'Column ' + (i + 1);
                        } else fields = getKeys(item);
                        out = makeRow(fields) + f.delimiter;
                    }
                    // emit exactly one cell per field: longer arrays are
                    // truncated, shorter arrays are padded with empty cells
                    for (i = 0; i < fields.length; i++) {
                        if (isArray(item)) {
                            value = item[i];
                        } else {
                            value = item[fields[i]];
                            // only missing values become empty strings, so
                            // legitimate 0 and false values are preserved
                            if (value === undefined || value === null) value = '';
                        }
                        cells.push(makeCell(value));
                    }
                    return out + cells.join(',');
                }
            };
        }
    };

    /**
     * @name pjs.writers
     * @namespace
     * <p>Writer namespace. You can add new writers here; new writer classes
     * should probably extend pjs.writers.base and redefine the
     * <code>write</code> method.</p>
     * <p>Items returned by scrapers will be added to the output via
     * <code>Writer.add(item)</code>, which can take any type of object. If
     * an array is provided, multiple items will be added.</p>
     * @example
        // create a new writer
        pjs.writers.myWriter = function(log) {
            var w = new pjs.writers.base(log);
            w.write = function(s) {
                // write s to some special place
            }
            return w;
        };
        // tell pjscrape to use your writer
        pjs.config({
            writer: 'myWriter'
        });
     */
    var writers = {
        /**
         * @name pjs.writers.base
         * @class Abstract base writer class
         * @param {Logger} log      Logger used to report each batch written
         * @private
         */
        base: function(log) {
            var w = this,
                count = 0,
                items = [],
                batchSize = config.batchSize,
                format = config.format || 'json',
                firstWrite = true,
                lastWrite = false;

            // init formatter
            var formatter = new formatters[format]();

            // write output
            var writeBatch = function(batch) {
                // never emit a dangling delimiter in front of an empty batch
                var prefix = firstWrite ? formatter.start :
                        batch.length ? formatter.delimiter : '';
                log.msg('Writing ' + batch.length + ' items');
                w.write(
                    prefix +
                    batch.map(formatter.format).join(formatter.delimiter) +
                    (lastWrite ? formatter.end : '')
                );
                firstWrite = false;
            };

            /**
             * Add an item to be written to output
             * @name pjs.writers.base#add
             * @function
             * @param {Object|String|Array} i   Item(s) to add
             */
            w.add = function(i) {
                // add to items
                if (i) {
                    i = arrify(i);
                    items = items.concat(i);
                    count += i.length;
                    // write if necessary. The strict ">" always leaves at
                    // least one item queued, so the final batch written by
                    // finish() is never empty once output has started.
                    while (batchSize && items.length > batchSize) {
                        writeBatch(items.splice(0, batchSize));
                    }
                }
            };

            /**
             * Finish up writing output
             * @name pjs.writers.base#finish
             * @function
             */
            w.finish = function() {
                lastWrite = true;
                writeBatch(items);
            };

            /**
             * Get the number of items written to output
             * @name pjs.writers.base#count
             * @function
             * @return {Number}     Number of items written
             */
            w.count = function() {
                return count;
            };

            /**
             * Write a string to output
             * @name pjs.writers.base#write
             * @function
             * @param {String} s    String to write
             */
            w.write = function(s) {
                console.log(s);
            };
        },

        /**
         * Writes output to config.outFile
         * @name pjs.writers.file
         * @type Writer
         */
        file: function(log) {
            var w = new writers.base(log);
            // clear file
            fs.write(config.outFile, '', 'w');
            // write method
            w.write = function(s) {
                fs.write(config.outFile, s, 'a');
            };
            return w;
        },

        /**
         * Writes output to one file per item. Items may be provided
         * in the format <code>{ filename: "file.txt", content: "string" }</code>
         * if you'd like to specify the filename in the scraper. Otherwise,
         * files are written to config.outFile with serial numbering.
         * @name pjs.writers.itemfile
         * @type Writer
         */
        itemfile: function(log) {
            var w = this,
                count = 0,
                format = config.format || 'raw',
                formatter = new formatters[format]();

            w.add = function(items) {
                // add to items
                if (items) {
                    items = arrify(items);
                    // write to separate files
                    items.forEach(function(item) {
                        var filename;
                        // support per-item filename syntax
                        if (item && item.filename && item.content) {
                            filename = item.filename;
                            item = item.content;
                        }
                        // otherwise add a serial number to config.outFile
                        else {
                            var fileparts = config.outFile.split('.'),
                                ext = fileparts.pop();
                            filename = fileparts.join('.') + '-' + count + '.' + ext;
                        }
                        fs.write(filename, formatter.format(item), 'w');
                        // count each item exactly once
                        count++;
                    });
                }
            };

            w.finish = function() {};

            w.count = function() {
                return count;
            };
        }
    };

    /**
     * Write output to STDOUT
     * @name pjs.writers.stdout
     * @type Writer
     */
    writers.stdout = writers.base;

    /**
     * @name pjs.hashFunctions
     * @namespace
     * Hash function namespace. You can add new hash functions here; hash functions
     * should take an item and return a unique (or unique-enough) string.
     * @example
        // create a new hash function
        pjs.hashFunctions.myHash = function(item) {
            return item.mySpecialUID;
        };
        // tell pjscrape to ignore dupes
        pjs.config({
            ignoreDuplicates: true
        });
        // tell pjscrape to use your hash function
        pjs.addSuite({
            hashFunction: 'myHash',
            // etc
        });
     */
    var hashFunctions = {
        /** UID hash - assumes item.id; falls back on md5
         * @name pjs.hashFunctions.id
         * @type HashFunction
         */
        id: function(item) {
            // the "in" operator throws on primitives (e.g. the strings
            // produced by selector-only scrapers), so test for an object first
            return (isObject(item) && 'id' in item) ? item.id : hashFunctions.md5(item);
        },
        /** md5 hash - collisions are possible
         * @name pjs.hashFunctions.md5
         * @type HashFunction
         */
        md5: function(item) {
            // md5() is expected to be defined by lib/md5.js, injected above
            return md5(JSON.stringify(item));
        }
    };


    // suite runner
    var runner = (function() {
        var visited = {},
            itemHashes = {},
            log,
            writer;

        /**
         * @class
         * Singleton: Manage multiple suites
         * @private
         */
        var SuiteManager = new function() {
            var mgr = this,
                complete,
                suiteq = [];

            // create a single WebPage object for reuse
            var page = require('webpage').create({
                // set up console output
                onConsoleMessage: function(msg, line, id) {
                    // kill initialization message
                    if (msg.indexOf('___') === 0) return;
                    id = id || 'injected code';
                    if (line) msg += ' (' + id + ' line ' + line + ')';
                    log.msg('CLIENT: ' + msg);
                },
                onAlert: function(msg) { log.alert('CLIENT: ' + msg); }
            });

            /**
             * Poll the page until <code>test</code> evaluates truthy or
             * config.timeoutLimit milliseconds have elapsed, then run
             * <code>callback(page)</code> (also run on timeout).
             * @param {Function} test       Function evaluated in the page
             * @param {Function} callback   Called with the page when done
             */
            page.waitFor = function(test, callback) {
                // check for short-circuit
                if (this.evaluate(test)) {
                    callback(page);
                    return;
                }
                // poll until timeout or success
                var elapsed = 0,
                    timeoutId = window.setInterval(function() {
                        var passed = page.evaluate(test),
                            timedOut = elapsed > config.timeoutLimit;
                        if (passed || timedOut) {
                            // only report a timeout if the test really failed
                            if (!passed) {
                                log.alert('Timeout after ' + ~~(elapsed / 1000) + ' seconds');
                            }
                            window.clearInterval(timeoutId);
                            callback(page);
                        } else {
                            elapsed += config.timeoutInterval;
                        }
                    }, config.timeoutInterval);
            };

            mgr.getPage = function() {
                return page;
            };

            // set the completion callback
            mgr.setComplete = function(cb) {
                complete = cb;
            };

            // add a ScraperSuite
            mgr.add = function(suite) {
                suiteq.push(suite);
            };

            // run the next ScraperSuite in the queue
            mgr.runNext = function() {
                var suite = suiteq.shift();
                if (suite) suite.run();
                else complete();
            };
        }();

        /**
         * @class
         * Scraper suite class - represents a set of urls to scrape
         * @private
         * @param {String} title        Title for verbose output
         * @param {String[]} urls       Urls to scrape
         * @param {Object} opts         Configuration object
         */
        var ScraperSuite = function(title, urls, opts) {
            var s = this,
                truef = function() { return true; };
            // set up options
            s.title = title;
            s.urls = urls;
            s.opts = extend({
                // evaluated in the page; _pjs is presumably defined by
                // client/pjscrape_client.js, which is injected before use
                ready: function() { return _pjs.ready; },
                scrapable: truef,
                preScrape: truef,
                hashFunction: hashFunctions.id
            }, config, opts);
            // deal with potential arrays and syntax variants
            s.opts.loadScript = arrify(s.opts.loadScripts || s.opts.loadScript);
            s.opts.scrapers = arrify(s.opts.scrapers || s.opts.scraper);
            // allow hash functions to be referenced by name, as documented
            if (typeof s.opts.hashFunction == 'string') {
                var hashName = s.opts.hashFunction;
                if (has(hashFunctions, hashName) && isFunction(hashFunctions[hashName])) {
                    s.opts.hashFunction = hashFunctions[hashName];
                } else {
                    log.alert('Could not find hash function "' + hashName + '", using "id"');
                    s.opts.hashFunction = hashFunctions.id;
                }
            }
            // set up completion callback
            s.complete = function() {
                log.msg(s.title + " complete");
                SuiteManager.runNext();
            };
            s.depth = 0;
        };

        ScraperSuite.prototype = {

            /**
             * Add an item, checking for duplicates as necessary
             * @param {Object|Array} items      Item(s) to add
             * @private
             */
            addItem: function(items) {
                var s = this;
                if (items && config.ignoreDuplicates) {
                    // ensure array, then drop anything whose hash was seen before
                    items = arrify(items).filter(function(item) {
                        var hash = s.opts.hashFunction(item);
                        // own-property check so hashes such as "constructor"
                        // are not mistaken for previously seen items
                        if (has(itemHashes, hash)) {
                            // hash hit - likely duplicate
                            // Could do a second-layer check against the actual object,
                            // but that requires retaining items in memory - skip for now
                            return false;
                        }
                        // hash miss - new item
                        itemHashes[hash] = true;
                        return true;
                    });
                }
                writer.add(items);
            },

            /**
             * Run the suite, scraping each url
             * @private
             */
            run: function() {
                var s = this,
                    scrapers = s.opts.scrapers,
                    subCount = 0,
                    runCounter = 0,
                    // get base URL for avoiding repeat visits and recursion loops
                    // NOTE(review): this strips the hash when newHashNewPage is
                    // truthy, which looks inverted relative to the option name -
                    // confirm the intended semantics before changing it.
                    baseUrl = function(url) {
                        return s.opts.newHashNewPage ? url.split('#')[0] : url;
                    },
                    // completion callback; page is false when the load failed
                    complete = function(page) {
                        // recurse if necessary
                        if (page && s.opts.moreUrls) {
                            // allow selector-only spiders
                            if (typeof s.opts.moreUrls == 'string') {
                                // The selector is embedded as a JSON string literal so
                                // that quotes inside it cannot break the generated code.
                                // A generated function is used (presumably) because
                                // page.evaluate runs the function in the page, where a
                                // closure over the selector would not be available.
                                s.opts.moreUrls = new Function(
                                    "return _pjs.getAnchorUrls(" + JSON.stringify(s.opts.moreUrls) + ");"
                                );
                            }
                            // look for more urls on this page
                            var moreUrls = page.evaluate(s.opts.moreUrls);
                            if (moreUrls && (!s.opts.maxDepth || s.depth < s.opts.maxDepth)) {
                                if (moreUrls.length) {
                                    log.msg('Found ' + moreUrls.length + ' additional urls to scrape');
                                    // make a new sub-suite
                                    var ss = new ScraperSuite(s.title + '-sub' + subCount++, moreUrls, s.opts);
                                    ss.depth = s.depth + 1;
                                    SuiteManager.add(ss);
                                }
                            }
                        }
                        runNext();
                    };
                // scrape the next unvisited url, or finish the suite
                function runNext() {
                    var url;
                    while (runCounter < s.urls.length) {
                        url = baseUrl(s.urls[runCounter++]);
                        // avoid repeat visits
                        if (config.allowRepeatUrls || !has(visited, url)) {
                            // scrape this url; complete() will call runNext again
                            s.scrape(url, scrapers, complete);
                            return;
                        }
                    }
                    s.complete();
                }
                log.msg(s.title + " starting");
                runNext();
            },

            /**
             * Scrape a single page.
             * @param {String} url              Url of page to scrape
             * @param {Array} scrapers          Scrapers to run on the page: functions,
             *                                  selector strings, or wrapper objects
             *                                  with a <code>scraper</code> function
             * @param {Function} complete       Callback function to run when complete;
             *                                  receives the page, or false on failure
             * @private
             */
            scrape: function(url, scrapers, complete) {
                var suite = this,
                    opts = suite.opts,
                    page = SuiteManager.getPage();
                log.msg('Opening ' + url);
                // the page object is reused, so forget the previous response
                page.resource = undefined;
                // set up callback to look for response codes
                page.onResourceReceived = function(res) {
                    if (res.stage == 'end' && res.url == url) {
                        page.resource = res;
                    }
                };
                // run the scrape
                page.open(url, function(status) {
                    // mark as visited, whether or not the load worked, so a
                    // failing url is not retried
                    visited[url] = true;
                    // check for load errors
                    if (status != "success") {
                        log.error('Page did not load (status=' + status + '): ' + url);
                        complete(false);
                        return;
                    }
                    // look for 4xx or 5xx status codes; no response may have been
                    // recorded if no received resource url matched exactly
                    var httpStatus = page.resource ? page.resource.status : undefined,
                        statusCodeStart = String(httpStatus).charAt(0);
                    if (statusCodeStart == '4' || statusCodeStart == '5') {
                        if (httpStatus == 404) {
                            log.error('Page not found: ' + url);
                        } else {
                            log.error('Page error code ' + httpStatus + ' on ' + url);
                        }
                        complete(false);
                        return;
                    }
                    log.msg('Scraping ' + url);
                    // load jQuery
                    page.injectJs('client/jquery.js');
                    page.evaluate(function() {
                        window._pjs$ = jQuery.noConflict(true);
                    });
                    // load pjscrape client-side code
                    page.injectJs('client/pjscrape_client.js');
                    // reset the global jQuery vars
                    if (!opts.noConflict) {
                        page.evaluate(function() {
                            window.$ = window.jQuery = window._pjs$;
                        });
                    }
                    // run scraper(s)
                    page.waitFor(opts.ready, function(page) {
                        if (!page.evaluate(opts.scrapable)) {
                            complete(page);
                            return;
                        }
                        // load script(s) if necessary
                        if (opts.loadScript) {
                            opts.loadScript.forEach(function(script) {
                                page.injectJs(script);
                            });
                        }
                        // run prescrape
                        page.evaluate(opts.preScrape);
                        // nothing to run - finish now instead of waiting forever
                        if (!scrapers || !scrapers.length) {
                            complete(page);
                            return;
                        }
                        // set up callback manager: complete once every scraper,
                        // synchronous or not, has reported in
                        var finished = 0,
                            checkComplete = function() {
                                if (++finished == scrapers.length) {
                                    complete(page);
                                }
                            };
                        // run each scraper and send any results to writer
                        scrapers.forEach(function(scraper) {
                            if (isFunction(scraper)) {
                                // standard scraper
                                suite.addItem(page.evaluate(scraper));
                                checkComplete();
                            } else if (typeof scraper == 'string') {
                                // selector-only scraper; the selector is embedded
                                // as a JSON string literal so quotes are safe
                                suite.addItem(page.evaluate(new Function(
                                    "return _pjs.getText(" + JSON.stringify(scraper) + ");"
                                )));
                                checkComplete();
                            } else if (scraper && scraper.scraper && scraper.async) {
                                // wrapped async scraper: start the scrape
                                page.evaluate(scraper.scraper);
                                // wait for the scraper to return items
                                page.waitFor(
                                    function() {
                                        return _pjs.items !== undefined;
                                    },
                                    function() {
                                        suite.addItem(page.evaluate(function() {
                                            return _pjs.items;
                                        }));
                                        checkComplete();
                                    }
                                );
                            } else if (scraper && scraper.scraper) {
                                // wrapped scraper without async: run it directly
                                suite.addItem(page.evaluate(scraper.scraper));
                                checkComplete();
                            } else {
                                // still count it, or the suite would never complete
                                log.alert('Skipping unrecognized scraper on ' + url);
                                checkComplete();
                            }
                        });
                    });
                });
            }
        };

        /**
         * Run the set of configured scraper suites.
         * @name pjs.init
         */
        function init() {
            // check requirements; return explicitly so that nothing below
            // runs after a fatal error has been reported
            if (!suites.length) {
                return fail('No suites configured');
            }
            if (!(config.log in loggers)) {
                return fail('Could not find logger: "' + config.log + '"');
            }
            if (!(config.writer in writers)) {
                return fail('Could not find writer "' + config.writer + '"');
            }

            // init logger
            log = new loggers[config.log]();
            // init writer
            writer = new writers[config.writer](log);

            // init suite manager
            SuiteManager.setComplete(function() {
                // scrape complete
                writer.finish();
                log.msg('Saved ' + writer.count() + ' items');
                phantom.exit();
            });
            // make all suites
            suites.forEach(function(suite, i) {
                SuiteManager.add(new ScraperSuite(
                    suite.title || "Suite " + i,
                    arrify(suite.url || suite.urls),
                    suite
                ));
            });
            // start the suite manager
            SuiteManager.runNext();
        }

        return {
            init: init
        };
    }());

    // expose namespaces and API functions
    return {
        loggers: loggers,
        formatters: formatters,
        writers: writers,
        hashFunctions: hashFunctions,
        init: runner.init,

        /**
         * Get or set config variables, applying to all suites. With no
         * arguments, returns the whole config object; with only a key,
         * returns that key's current value.
         * @name pjs.config
         * @param {String|Object} [key] Either a key to set or an object with
         *                              multiple values to set
         * @param {mixed} [val]         Value to set if using config(key, val) syntax
         * @return {mixed}              The config object or a single value
         *                              when reading; undefined when setting
         */
        config: function(key, val) {
            if (!key) {
                return config;
            } else if (typeof key == 'object') {
                extend(config, key);
            } else if (val !== undefined) {
                // compare against undefined so that falsy values such as
                // 0, false and '' can be set
                config[key] = val;
            } else {
                return config[key];
            }
        },

        /**
         * Add one or more scraper suites to be run.
         * @name pjs.addSuite
         * @param {Object} suite    Scraper suite configuration object
         * @param {Object} [...]    More suite configuration objects
         */
        addSuite: function() {
            suites = Array.prototype.concat.apply(suites, arguments);
        },

        /**
         * Shorthand function to add a simple scraper suite.
         * @name pjs.addScraper
         * @param {String|String[]} url             URL or array of URLs to scrape
         * @param {Function|Function[]} scraper     Scraper function or array of scraper functions
         */
        addScraper: function(url, scraper) {
            suites.push({url: url, scraper: scraper});
        }
    };
}());


// make sure we have a config file
if (!phantom.args.length) {
    // die
    console.log('Usage: pjscrape.js <configfile.js> ...');
    phantom.exit();
} else {
    // load the config file(s), stopping at the first one that is missing
    var configsLoaded = phantom.args.every(function(configFile) {
        if (phantom.injectJs(configFile)) {
            return true;
        }
        fail('Config file not found: ' + configFile);
        return false;
    });
    // start the scrape only when every config file was loaded
    if (configsLoaded) {
        pjs.init();
    }
}