1 /**
  2  * @overview
  3  * Client-side helpers for pjscrape
  4  * @name pjscrape_client.js
  5  */
  6 
  7 /**
  8  * @namespace
  9  * Namespace for client-side utility functions. This will be available
 10  * to scrapers as <code>_pjs</code> or <code>window._pjs</code>.
 11  * @name _pjs
 12  */
 13 window._pjs = (function($) {
 14     
 15     // munge the location
 16     var loc = window.location,
 17         base = loc.protocol + '//' + loc.hostname + (loc.port ? ':' + loc.port : ''),
 18         path = loc.pathname.split('/').slice(0,-1).join('/') + '/';
 19     
 20     /**
 21      * Check whether a URL is local to this site
 22      * @name _pjs.isLocalUrl
 23      * @param {String} url      URL to check
 24      * @return {Boolean}        Whether this URL is local
 25      */
 26     function isLocalUrl(url) {
 27         return !url.match(/^(https?:\/\/|mailto:)/) || url.indexOf(base) === 0;
 28     }
 29     
 30     /**
 31      * Convert a local URL to a fully qualified URL (with domain name, etc)
 32      * @name _pjs.toFullUrl
 33      * @param {String} url      URL to convert
 34      * @return {String}         Fully qualified URL
 35      */
 36     function toFullUrl(url) {
 37         // non-existent, or fully qualified already
 38         if (!url || url.indexOf(base) === 0 || !isLocalUrl(url)) return url;
 39         // absolute url
 40         if (url[0] == '/') return base + url;
 41         // relative url - browser can figure out ..
 42         return base + path + url;
 43     }
 44     
 45     /**
 46      * Convenience function - find all anchor tags on the page matching the given
 47      * selector (or jQuery object) and return an array of fully qualified URLs
 48      * @name _pjs.getAnchorUrls
 49      * @param {String|jQuery} selector      Selector or jQuery object to find anchor elements
 50      * @param {Boolean} includeOffsite      Whether to include off-site links
 51      * @return {String[]}                   Array of fully qualified URLs
 52      */
 53     function getAnchorUrls(selector, includeOffsite) {
 54         return $(selector).map(function() {
 55             var href = $(this).attr('href');
 56             return (href && href.indexOf('#') !== 0 && (includeOffsite || isLocalUrl(href))) ? 
 57                 toFullUrl(href) : undefined;
 58         }).toArray();
 59     }
 60     
 61     /**
 62      * Convenience function - find all tags on the page matching the given
 63      * selector (or jQuery object) and return inner text for each
 64      * @name _pjs.getText
 65      * @param {String|jQuery} selector      Selector or jQuery object to find elements
 66      * @return {String[]}                   Array of text contents
 67      */
 68     function getText(selector) {
 69         return $(selector).map(function() {
 70             return $(this).text();
 71         }).toArray();
 72     }
 73     
 74     /**
 75      * Get a set of records by looking for repeated patterns in the .content()
 76      * of the selected element. Patterns can be supplied as either objects or
 77      * arrays; the record format will match the input format. Pattern pieces
 78      * can be either selector strings, regular expressions, or "text" for
 79      * unwrapped text blocks. Interstitial cruft (br tags, line breaks, etc)
 80      * will be ignored if they don't have any text other than whitespace.
 81      * Pattern pieces can also be specified as objects, in the format
 82      * <code>{pattern: [pattern piece], ...options}</code>, in order to specify 
 83      * additional options. Available options are <code>optional</code> (boolean), 
 84      * <code>ignore</code> (boolean, require but don't return content),
 85      * <code>inner</code> (boolean, scrape again in the previous element),
 86      * <code>scrape</code> (custom function to scrape content from matched element), 
 87      * and <code>test</code> (custom function to test if element matches).
 88      * @name _pjs.getPattern
 89      * @param {String|jQuery} selector      Selector or jQuery object to find elements
 90      * @param {Object|Array} pattern        Pattern to look for
 91      * @return {Object[]|Array[]}           Records in format matching the pattern format
 92      */
 93     function getPattern(selector, pattern) {
 94         var isArray = Array.isArray(pattern),
 95             pieces = isArray ? pattern :  [],
 96             testBlank = function(el) {
 97                 return (/^\s*$/).test($(el).text())
 98             },
 99             output = [],
100             contents = $(selector).contents().toArray(),
101             prevPattern;
102         // quick fail
103         if (!contents.length) return [];
104         // set up pattern pieces
105         function makePiece(piece, key) {
106             if (typeof piece == 'object' && !(piece instanceof RegExp)) {
107                 piece.key = key;
108                 // inner pieces still need a pattern to match the current element
109                 if (piece.inner) {
110                     piece.pattern = prevPattern;
111                 }
112             } else {
113                 piece = {
114                     key: key,
115                     pattern: piece
116                 }
117             }
118             // save for inner if necessary
119             prevPattern = piece.pattern;
120             // set scrape function, if not supplied
121             piece.scrape = piece.scrape || function(el) {
122                 return $(el).text().trim()
123             }
124             // set test function
125             piece.test = piece.test || function(el) {
126                 return piece.pattern == "text" ? // text node
127                         el.nodeType == Node.TEXT_NODE && !testBlank(el) :
128                     typeof piece.pattern == "string" ? // selector
129                         $(el).is(piece.pattern) :
130                     piece.pattern instanceof RegExp ? // regexp
131                         piece.pattern.test($(el).text()) : false;
132             }
133             return piece;
134         }
135         // convert object to array
136         if (!isArray) {
137             for (var key in pattern) {
138                 pieces.push(makePiece(pattern[key], key))
139             }
140         } else {
141             // convert array to desired format
142             pieces = pieces.map(makePiece);
143         }
144         // quick exit #2
145         if (!pieces.length) return;
146         // create a state automaton
147         var state, collector;
148         function reset() {
149             state = 0,
150             collector = isArray ? [] : {};
151         }
152         // save and reset if necessary
153         function checkReset() {
154             if (state >= pieces.length) {
155                 output.push(collector);
156                 reset();
157             }
158         }
159         function step(el) {
160             if (testBlank(el)) return;
161             checkReset(); // check at the beginning for trailing optional
162             var piece = pieces[state];
163             // check for match
164             if (piece.test(el)) {
165                 // hit; scrape
166                 if (!piece.ignore) {
167                     collector[piece.key] = piece.scrape(el);
168                 }
169                 state++;
170                 // lookahead for inner patterns
171                 if (pieces[state] && pieces[state].inner) {
172                     step(el);
173                 }
174             } else if (piece.optional) {
175                 // optional; advance
176                 state++;
177                 step(el);
178             } else if (state > 0) reset(); // miss; reset
179             checkReset();
180         }
181         // iterate through the contents
182         reset();
183         contents.forEach(step);
184         
185         return output;
186     }
187     
188     /**
189      * Wait for a condition to occur, then execute the callback
190      * @name _pjs.waitFor
191      * @param {Function} test       Test function; should return true when ready
192      * @param {Function} callback   Callback function to execute
193      */
194     function waitFor(test, callback) {
195         var intervalId = window.setInterval(function() {
196             if (test()) {
197                 window.clearInterval(intervalId);
198                 callback();
199             }
200         }, 100);
201     }
202     
203     /**
204      * Wait for an element to appear, then execute the callback
205      * @name _pjs.waitForElement
206      * @param {String} selector     JQuery selector to look for
207      * @param {Function} callback   Callback function to execute
208      */
209     function waitForElement(selector, callback) {
210         waitFor(function() {
211             return !!$(selector).length;
212         }, callback);
213     }
214     
215     /**
216      * Flag that will be set to true when $(document).ready is called. 
217      * Generally your code will not need to deal with this - use the "ready"
218      * configuration parameter instead.
219      * @type Boolean
220      * @name _pjs.ready
221      */
222     
223     return {
224         isLocalUrl: isLocalUrl,
225         toFullUrl: toFullUrl,
226         getAnchorUrls: getAnchorUrls,
227         getText: getText,
228         getPattern: getPattern,
229         waitFor: waitFor,
230         waitForElement: waitForElement,
231         /**
232          * Reference to jQuery. This is guaranteed to be
233          * the pjscrape.js version of the jQuery library.
234          * Scrapers using the 'noConflict' config option 
235          * should use this reference in their code.
236          * @type jQuery
237          * @name _pjs.$
238          */
239         '$': $
240     };
241 }(_pjs$));
242 
// Register a DOM-ready handler that raises the _pjs.ready flag.
window._pjs.$(function markReady() {
    window._pjs.ready = true;
});

// Do not remove: per the original author's note, omitting this log call
// causes an error on pages containing an input of type "image". The cause
// is unknown.
console.log('___ Client-side code initialized');