/**
 * @overview
 * Client-side helpers for pjscrape
 * @name pjscrape_client.js
 */

/**
 * @namespace
 * Namespace for client-side utility functions. This will be available
 * to scrapers as <code>_pjs</code> or <code>window._pjs</code>.
 * @name _pjs
 */
window._pjs = (function($) {

    // munge the location
    var loc = window.location,
        // scheme + host (+ explicit port) of the current page, no trailing slash
        base = loc.protocol + '//' + loc.hostname + (loc.port ? ':' + loc.port : ''),
        // directory part of the current path, always ending in "/"
        path = loc.pathname.split('/').slice(0,-1).join('/') + '/',
        // any leading "scheme:" (http:, mailto:, javascript:, tel:, ...)
        schemeRe = /^[a-z][a-z0-9+.\-]*:/i;

    /**
     * Check whether a URL starts with this site's base *as a whole origin*.
     * A bare prefix test is not enough: "http://example.com.evil.com" starts
     * with "http://example.com" but is a different host, so the character
     * following the base must end the authority part.
     * @private
     * @param {String} url  URL to check
     * @return {Boolean}    Whether the URL is on this site's base
     */
    function hasBase(url) {
        if (url.indexOf(base) !== 0) return false;
        var next = url.charAt(base.length);
        return next === '' || next === '/' || next === '?' || next === '#';
    }

    /**
     * Check whether a URL is local to this site. URLs without a scheme are
     * local; URLs with a scheme (or protocol-relative "//host/..." URLs) are
     * local only if they point at this page's protocol, host and port.
     * @name _pjs.isLocalUrl
     * @param {String} url  URL to check
     * @return {Boolean}    Whether this URL is local
     */
    function isLocalUrl(url) {
        // protocol-relative: borrow the page's protocol, then compare hosts
        if (url.indexOf('//') === 0) return hasBase(loc.protocol + url);
        return !schemeRe.test(url) || hasBase(url);
    }

    /**
     * Convert a local URL to a fully qualified URL (with domain name, etc).
     * Empty values and URLs that already carry a scheme are returned unchanged.
     * @name _pjs.toFullUrl
     * @param {String} url  URL to convert
     * @return {String}     Fully qualified URL
     */
    function toFullUrl(url) {
        // non-existent
        if (!url) return url;
        // protocol-relative url - only the scheme is missing
        if (url.indexOf('//') === 0) return loc.protocol + url;
        // fully qualified already (on-site or off-site), or a non-http scheme
        if (schemeRe.test(url)) return url;
        var first = url.charAt(0);
        // absolute url
        if (first === '/') return base + url;
        // query-only url - keeps the current file name
        if (first === '?') return base + loc.pathname + url;
        // fragment-only url - keeps the current file name and query
        if (first === '#') return base + loc.pathname + loc.search + url;
        // relative url - browser can figure out ..
        return base + path + url;
    }

    /**
     * Convenience function - find all anchor tags on the page matching the given
     * selector (or jQuery object) and return an array of fully qualified URLs.
     * Anchors without an href and pure fragment links ("#...") are skipped.
     * @name _pjs.getAnchorUrls
     * @param {String|jQuery} selector      Selector or jQuery object to find anchor elements
     * @param {Boolean} includeOffsite      Whether to include off-site links
     * @return {String[]}                   Array of fully qualified URLs
     */
    function getAnchorUrls(selector, includeOffsite) {
        return $(selector).map(function() {
            var href = $(this).attr('href');
            // jQuery's .map() drops undefined results, which filters the list
            return (href && href.indexOf('#') !== 0 && (includeOffsite || isLocalUrl(href))) ?
                toFullUrl(href) : undefined;
        }).toArray();
    }

    /**
     * Convenience function - find all tags on the page matching the given
     * selector (or jQuery object) and return inner text for each
     * @name _pjs.getText
     * @param {String|jQuery} selector      Selector or jQuery object to find elements
     * @return {String[]}                   Array of text contents
     */
    function getText(selector) {
        return $(selector).map(function() {
            return $(this).text();
        }).toArray();
    }

    /**
     * Get a set of records by looking for repeated patterns in the .content()
     * of the selected element. Patterns can be supplied as either objects or
     * arrays; the record format will match the input format. Pattern pieces
     * can be either selector strings, regular expressions, or "text" for
     * unwrapped text blocks. Interstitial cruft (br tags, line breaks, etc)
     * will be ignored if they don't have any text other than whitespace.
     * Pattern pieces can also be specified as objects, in the format
     * <code>{pattern: [pattern piece], ...options}</code>, in order to specify
     * additional options. Available options are <code>optional</code> (boolean),
     * <code>ignore</code> (boolean, require but don't return content),
     * <code>inner</code> (boolean, scrape again in the previous element),
     * <code>scrape</code> (custom function to scrape content from matched element),
     * and <code>test</code> (custom function to test if element matches).
     * The supplied pattern and its option objects are not modified.
     * @name _pjs.getPattern
     * @param {String|jQuery} selector      Selector or jQuery object to find elements
     * @param {Object|Array} pattern        Pattern to look for
     * @return {Object[]|Array[]}           Records in format matching the pattern format
     *                                      (an empty array if nothing can match)
     */
    function getPattern(selector, pattern) {
        var isArray = Array.isArray(pattern),
            hasOwn = Object.prototype.hasOwnProperty,
            pieces = [],
            output = [],
            contents = $(selector).contents().toArray(),
            prevPattern,
            // automaton state: index of the piece expected next, and the
            // record being assembled
            state,
            collector;

        // true for nodes with no text other than whitespace
        function testBlank(el) {
            return (/^\s*$/).test($(el).text());
        }

        // quick fail
        if (!contents.length) return [];

        // set up pattern pieces; always works on a private copy so the
        // caller's option objects stay untouched and can be reused
        function makePiece(spec, key) {
            var piece = {},
                prop;
            if (spec && typeof spec === 'object' && !(spec instanceof RegExp)) {
                for (prop in spec) {
                    piece[prop] = spec[prop];
                }
                // inner pieces still need a pattern to match the current element
                if (piece.inner) {
                    piece.pattern = prevPattern;
                }
            } else {
                piece.pattern = spec;
            }
            piece.key = key;
            // save for inner if necessary
            prevPattern = piece.pattern;
            // set scrape function, if not supplied
            piece.scrape = piece.scrape || function(el) {
                return $(el).text().trim();
            };
            // set test function, if not supplied
            piece.test = piece.test || function(el) {
                return piece.pattern === "text" ? // text node
                        el.nodeType === Node.TEXT_NODE && !testBlank(el) :
                    typeof piece.pattern === "string" ? // selector
                        $(el).is(piece.pattern) :
                    piece.pattern instanceof RegExp ? // regexp
                        piece.pattern.test($(el).text()) : false;
            };
            return piece;
        }

        if (isArray) {
            // array pattern: keys are the array indexes
            pieces = pattern.map(makePiece);
        } else {
            // object pattern: own keys only, in case the page has extended
            // Object.prototype with enumerable properties
            for (var key in pattern) {
                if (hasOwn.call(pattern, key)) {
                    pieces.push(makePiece(pattern[key], key));
                }
            }
        }
        // quick exit #2
        if (!pieces.length) return [];

        // create a state automaton
        function reset() {
            state = 0;
            collector = isArray ? [] : {};
        }
        // save and reset if necessary
        function checkReset() {
            if (state >= pieces.length) {
                output.push(collector);
                reset();
            }
        }
        // feed one node to the automaton; `skipped` counts the optional
        // pieces already passed over for this same node
        function step(el, skipped) {
            if (testBlank(el)) return;
            checkReset(); // check at the beginning for trailing optional
            var piece = pieces[state];
            // check for match
            if (piece.test(el)) {
                // hit; scrape
                if (!piece.ignore) {
                    collector[piece.key] = piece.scrape(el);
                }
                state++;
                // lookahead for inner patterns
                if (pieces[state] && pieces[state].inner) {
                    step(el, skipped);
                }
            } else if (piece.optional) {
                if (skipped + 1 >= pieces.length) {
                    // every piece has been tried against this node and all
                    // were optional misses; stop instead of cycling forever
                    reset();
                } else {
                    // optional; advance
                    state++;
                    step(el, skipped + 1);
                }
            } else if (state > 0) reset(); // miss; reset
            checkReset();
        }
        // iterate through the contents (wrapped so forEach's index argument
        // is not mistaken for the skip counter)
        reset();
        contents.forEach(function(el) {
            step(el, 0);
        });

        // a final record that only lacks trailing optional pieces is complete,
        // but no further node will arrive to trigger checkReset for it
        if (state > 0) {
            var complete = true;
            for (var i = state; i < pieces.length; i++) {
                if (!pieces[i].optional) {
                    complete = false;
                    break;
                }
            }
            if (complete) output.push(collector);
        }

        return output;
    }

    /**
     * Wait for a condition to occur, then execute the callback.
     * The test is polled every 100ms until it returns a truthy value.
     * @name _pjs.waitFor
     * @param {Function} test       Test function; should return true when ready
     * @param {Function} callback   Callback function to execute
     */
    function waitFor(test, callback) {
        var intervalId = window.setInterval(function() {
            if (test()) {
                // stop polling before running the callback
                window.clearInterval(intervalId);
                callback();
            }
        }, 100);
    }

    /**
     * Wait for an element to appear, then execute the callback
     * @name _pjs.waitForElement
     * @param {String} selector     JQuery selector to look for
     * @param {Function} callback   Callback function to execute
     */
    function waitForElement(selector, callback) {
        waitFor(function() {
            return !!$(selector).length;
        }, callback);
    }

    /**
     * Flag that will be set to true when $(document).ready is called.
     * Generally your code will not need to deal with this - use the "ready"
     * configuration parameter instead.
     * @type Boolean
     * @name _pjs.ready
     */

    return {
        isLocalUrl: isLocalUrl,
        toFullUrl: toFullUrl,
        getAnchorUrls: getAnchorUrls,
        getText: getText,
        getPattern: getPattern,
        waitFor: waitFor,
        waitForElement: waitForElement,
        /**
         * Reference to jQuery. This is guaranteed to be
         * the pjscrape.js version of the jQuery library.
         * Scrapers using the 'noConflict' config option
         * should use this reference in their code.
         * @type jQuery
         * @name _pjs.$
         */
        '$': $
    };
}(_pjs$));

// bind to .ready()
window._pjs.$(function() {
    window._pjs.ready = true;
});

// for reasons I can't fathom, omitting this line throws an
// error on pages with <input type="image">. Go figure.
console.log('___ Client-side code initialized');