BrowserView.js
[app.wkmirror] / BrowserView.js
1 Gtk = imports.gi.Gtk;
2 WebKit = imports.gi.WebKit;
3
4 TabbedBrowser = imports.TabbedBrowser;
5 File = imports.File.File;
6
7 base64 = imports.base64.base64;
8
9 BrowserView = new GType({
10     parent: WebKit.WebView.type,
11     name: "BrowserView",
12     init: function ()
13     {
14         // Private
15         
16         
17         var _t = this;
18         
19         var tab;
20         var browsePage = false;
21         var maxQueue = 0;
22         var injected = {};
23         
24         var storedir = '/tmp/wkqueue';
25         if (!File.exists(storedir)) {
26             File.mkdir(storedir);
27         }
28         var parsedir = storedir + '/parse_queue';
29         var downloaddir = storedir + '/download_queue';
30         var donedir = storedir + '/downloaded_queue';
31         if (!File.exists(parsedir)) {
32             File.mkdir(parsedir);
33         }
34         if (!File.exists(downloaddir)) {
35             File.mkdir(downloaddir);
36         }
37         if (!File.exists(donedir)) {
38             File.mkdir(donedir);
39         }
40         
41         var update_title = function (web_view, web_frame, title)
42         {
43             if(title.length > 25)
44                 title = title.slice(0,25) + "...";
45
46             tab.get_tab_label().label = title;
47         };
48
49         var update_url = function (web_view, web_frame)
50         {
51             var toolbar = tab.get_toolbar();
52
53             toolbar.set_url(web_frame.get_uri());
54             toolbar.set_can_go_back(web_view.can_go_back());
55             toolbar.set_can_go_forward(web_view.can_go_forward());
56         };
57
58         var update_progress = function (bar, progress)
59         {
60             tab.get_toolbar().set_progress(progress / 100);
61         };
62
63         var create_new_tab = function (web_view, web_frame, new_web_view)
64         {
65             new_web_view = new BrowserView();
66             new_web_view.signal.web_view_ready.connect(show_new_tab);
67             return new_web_view;
68         };
69
70         var show_new_tab = function (new_web_view)
71         {
72             TabbedBrowser.browser.new_tab("", new_web_view);
73
74             return false;
75         };
76
77         var hover_link = function (web_view, link, url)
78         {
79             tab.get_statusbar().set_status(url);
80         };
81
82         
83         
84         
85         this.add_inject = function(force)
86         {
87             
88             if (force || (typeof(injected[this.uri]) == 'undefined' )) {
89                 injected[this.uri] = 0;
90             }
91             if (injected[this.uri] > 2) {
92                 return;
93             }
94             injected[this.uri]++;
95             var fn = __script_path__ + "/inject.js";
96             if (File.exists(fn)) {
97                 print("Adding inject");
98                 var newjs = File.read(__script_path__ + "/inject.js");
99                 TabbedBrowser.browser.current_tab().get_web_view().execute_script(
100                     newjs
101                     
102                 );
103             }
104             
105         }
106          this.add_nsinject = function(force)
107         {
108             
109             //if (force || (typeof(injected[this.uri]) == 'undefined' )) {
110             //    injected[this.uri] = 0;
111             //}
112             //if (injected[this.uri] > 2) {
113             //    return;
114             //}
115             //injected[this.uri]++;
116             var fn = __script_path__ + "/nsinject.js";
117             if (File.exists(fn)) {
118                 print("Adding inject");
119                 var newjs = File.read(__script_path__ + "/nsinject.js");
120                 TabbedBrowser.browser.current_tab().get_web_view().execute_script(
121                     newjs
122                     
123                 );
124             }
125             
126         }
127         
128         
129         
130         
131         
132         var load_finished = function ()
133         {
134             print("load finished");
135             tab.get_toolbar().set_progress(0);
136             
137             
138             var pd = JSON.parse(File.read('/home/alan/.nspasswd'));
139             
140             
141             _t.add_inject(true);
142             _t.add_nsinject(true);     
143             TabbedBrowser.browser.current_tab().get_web_view().execute_script(
144                 'NS.login(' +  JSON.stringify(pd.username) + JSON.stringify(pd.password) + ' );'
145                 
146             );
147             
148             if (browsePage) {
149                 print("onload: calling gather links");
150                 TabbedBrowser.browser.current_tab().get_web_view().execute_script(
151                     'BrowserMirror.gatherlinks();'
152                     
153                 );
154                 
155             }
156             
157             
158         };
159
160         var load_committed = function (web_view, web_frame)
161         {
162             update_url(web_view, web_frame);
163         };
164
165         var clicked_link = function (web_view, web_frame, request,
166                                      action, decision, window)
167         {
168             if(action.get_reason() == WebKit.WebNavigationReason.LINK_CLICKED &&
169                action.get_button() == 2)
170             {
171                 browser.new_tab(request.get_uri(), null);
172                 return true;
173             }
174
175             return false;
176         };
177
178         // Public
179         
180         
181         this.browse = function (url)
182         {
183             if(url.search("://") < 0)
184                 url = "http://" + url;
185
186             this.open(url);
187         };
188
189         this.set_tab = function (new_tab)
190         {
191             tab = new_tab;
192         };
193
194         this.get_tab = function ()
195         {
196             return tab;
197         };
198
199
200         this.downloadqueue = function()
201         {
202             var filesList = File.list(downloaddir);
203             print("DOWNLOAD QUEUE LENGTH: " + filesList.length);
204             if (!maxQueue || maxQueue < filesList.length) {
205                 maxQueue  = filesList.length;
206             }
207             
208             tab.get_toolbar().set_progress(filesList.length / Math.max(maxQueue, filesList.length ));
209             if (filesList == null || filesList.length == 0  ){
210                 
211                 return false;
212             }  
213             var url = decodeURIComponent(filesList[0]);
214             if (!this.checkdomain(url)) {
215                 print("SKIP (external domain) : " + url);
216                 File.remove(downloaddir + '/' + filesList[0]);
217                 return this.downloadqueue();
218                 
219             }
220             this.downloadhead( url);
221             return true;
222             
223         };
224         
225     
226         this.queuerun = function()
227         {
228             if (this.downloadqueue()) return true;
229             return this.parsequeue();
230         };
231         
232         this.parsequeue = function(){
233             var filesList = File.list(parsedir);
234             if (filesList == null || filesList.length == 0  ){
235                 return false;
236             } 
237             
238             browsePage = decodeURIComponent(filesList[0]);
239             print("parsing page:" + browsePage );
240             this.browse(browsePage);
241             
242              
243             
244             
245             
246             return true;
247         };
248         
249         
250         this.downloadpage = function(link){
251             print("calling download page: " + link);
252             _t.add_inject(); // just in case..
253                 //var url = File.read(__script_path__+"/downloadqueue/"+link);
254             this.execute_script(
255                 "BrowserMirror.downloadpage(" + JSON.stringify(link) +");"
256             );          
257                 
258             
259         };
260  
261         this.downloadhead = function(link){
262             print("calling download head: " + link);
263             _t.add_inject(); // just in case..
264                 //var url = File.read(__script_path__+"/downloadqueue/"+link);
265             this.execute_script(
266                 "BrowserMirror.downloadhead(" + JSON.stringify(link) +");"
267             );          
268                  
269         };
270         var nsqueue = false;
271         this.nsdownloadNext = function()
272         {
273             if (nsqueue === false) {
274                 nsqueue = [
275                            //282,273,285,395,271,272,278,394,402,279,432,280,284,281,433,437,404,444,151,152,1,283,287,288,418,407,398,449,147,150,131,149,443,148,411,405,415,417,416,406,408,399,410,409,412,400,155,133,124,157,118,183,450,434,301,435,130,286,162,292,431,161,290,146,145,158,159,120,137,128,139,163,293,160,289,291,141,140,138,142,144,110,401,156,154,153,170,167,295,166,294,277,425,164,165,168,296,169,297,113,171,119,180,300,176,175,172,302,299,
276                            //298,421,187,307,428,275,269,304,181,182,174,173,184,303,112,179,114,305,177,178,186,306,403,420,111,109,309,188,308,446,445,310,311,53,189,52,312,419,270,190,191,192,193,54,194,108,268,121,195,206,396,205,207,324,220,340,212,331,219,339,222,342,436,209,327,424,208,326,210,328,330,325,211,329,218,337,338,224,223,217,336,214,333,213,332,216,335,221,341,215,334,197,314,
277                             //196,313,198,316,201,320,317,204,323,203,322,202,321,200,319,199,318,346,58,267,393,265,391,438,264,390,261,387,262,388,260,386,266,392,413,414,263,389,347,225,343,227,345,226,344,422,228,348,237,358,231,352,229,349,351,233,354,238,359,360,239,361,230,350,232,353,234,355,439,236,357,240,362,235,356,254,380,257,383,259,385,255,381,256,448,382,258,384,374,252,375,376,251,373,378,429,430,379,253,244,366,247,
278                            //369,363,242,364,241,243,365,246,368,245,367,377,427,
279                            426,250,447,372,249,371,248,370,442,397,129,441,440,116,122,123,125,126,136,115,117,127
280                         ];
281             }
282             
283             if (!nsqueue.length) {
284                 print("DONE");
285                 return;
286             }
287             var pg = nsqueue.shift() ;
288             
289             if ( File.exists( storedir+'/output/' + pg + '.csv')) {
290                 this.nsdownloadNext();
291                 return;
292             }
293             
294             this.execute_script(
295                 "NS.rungrab(" + pg +");"
296             );  
297             
298             
299         }
300
301         // Implementation
302         //this.set_scroll_adjustments(null, null);
303
304         this.signal.title_changed.connect(update_title);
305         this.signal.load_committed.connect(load_committed);
306         this.signal.load_finished.connect(load_finished);
307         this.signal.load_progress_changed.connect(update_progress);
308
309         // For some reason, this segfaults seed in the instance init closure handler
310         // Once that's fixed, uncommenting the next line will give middle-click-open-in-new tab
311         //this.signal.navigation_policy_decision_requested.connect(clicked_link);
312
313         this.signal.hovering_over_link.connect(hover_link);
314
315         this.signal.create_web_view.connect(create_new_tab);
316         
317          
318         print("ADDing console message sig handler");
319         
320         
321         
322         
323         this.signal.console_message.connect(function(wv, msg, line, sid) {
324             // print('BrowserView.js got ' + msg);
325             var methodname;
326             var ret;
327             try {
328                 ret = JSON.parse(msg);
329             
330             } catch(e) {
331                 print("GOT INVALID message:" + msg)
332                 return true;
333                 
334                 
335             }
336             print("got method : " + ret.method);
337             
338             if (ret.method == 'nsdownloadpage'){
339                 try { 
340                     var mt = ret.contentType.split(';').shift();
341                 } catch( e) {
342                     print("INVALID CONTENT TYPE? \n" + JSON.stringify(ret));
343                     mt='';
344                 }
345                 
346                 switch(mt) {
347                    
348                     case '':
349                          _t.nsdownloadNext();
350                        // _t.moveToDone(ret.requesturl);
351                         break;
352                     
353                     default:
354                         var info = false;
355                         var info_f = _t.dupeCheck(ret.requesturl);
356                         if (info_f) {
357                             info  = JSON.parse(File.read(info_f));
358                         }
359                          var target  = storedir+'/output/' + ret.filename 
360                          // flag it as done..
361                        
362                         
363                         
364                         
365                         
366                         //File.write(target ,decodeURIComponent(escape(base64.decode( ret.data))));
367                         print("GOT array sized: " + ret.data.length);
368                         
369                         File.writeBinaryArray(target,ret.data);
370                         print("Wrote to file: " + target);
371                         // get next..
372                         _t.nsdownloadNext();
373                         
374                         
375                         
376                         break;
377                 }
378                 return true; //???
379             }
380             
381             if (ret.method == 'gatherlinks'){
382                 // flag the page as parsed.
383                 
384                 if (browsePage) {
385                     _t.moveToDone(browsePage);
386                     
387                 }
388                 var sourcePage = browsePage;
389                 browsePage = false;
390                 print(typeof(ret.data));
391                 if (typeof(ret.data) != 'object' ) {
392                     print("GOT INVALID DATA?:" + JSON.stringify(ret,null,4));
393                     ret.data= [];
394                     
395                 }
396                 ret.data.forEach(function(ln) {
397                     if (!ln.href.match(/^http[s]*:\/\//)) {
398                         print("SKIP link: " + ln.href);
399                         return;
400                     }
401                     ln.href= ln.href.replace(/#.*$/,'');
402                     
403                     if (!_t.checkdomain(ln.href) ) {
404                         print("SKIP link (external domain): " + ln.href);
405                         return;
406                     }
407                     
408                     // this is just for our purposes..
409                     
410                     if (ln.href.match(/\/pages\/[0-9]+$/)) {
411                         print("SKIP link (ingore unnamed pages): " + ln.href);
412                         return;
413                     }
414                     
415                     
416                     
417                     var fn = encodeURIComponent(ln.href);
418                     
419                     var dupe  = _t.dupeCheck(ln.href);
420                     if (dupe) {
421                         
422                         if (dupe == downloaddir + '/'  + fn) {
423                             var info = JSON.parse(File.read(dupe));
424                             if (info && info.fromUrl && info.fromUrl.length > sourcePage.length) {
425                                 print("SKIP link (in queue already): " + ln.href);
426                                 return;
427                             }
428                             print("found a longer link for url")
429                         } else {
430                             print("SKIP link (in another queue): " + ln.href);
431                             return;
432                         }
433                     }
434                      
435                     File.write(downloaddir + '/'  + fn, JSON.stringify( {
436                         label : ln.label,
437                         fromURL : sourcePage
438                     })); // write an empyt file indicating it needs downloading..
439                 });         
440                 var filesList = File.list(downloaddir);
441             
442                 maxQueue =  filesList.length;
443             }
444             
445             if (ret.method == 'downloadpage'){
446                 // got the results from download page:
447                 // requesturl 
448                 // remove from downloadqueue.
449              
450                 
451                 //
452                 try { 
453                     var mt = ret.contentType.split(';').shift();
454                 } catch( e) {
455                     print("INVALID CONTENT TYPE? \n" + JSON.stringify(ret));
456                     mt='';
457                 }
458                 
459                 switch(mt) {
460                    
461                     case '':
462                         _t.moveToDone(ret.requesturl);
463                         break;
464                     
465                     default:
466                         var info = false;
467                         var info_f = _t.dupeCheck(ret.requesturl);
468                         if (info_f) {
469                             info  = JSON.parse(File.read(info_f));
470                         }
471                     
472                         _t.moveToDone(ret.requesturl);
473                         // flag it as done..
474                        
475                         // write it...
476                         var target  = _t.toFilename(ret.requesturl);
477                         if (info && info.fromURL) {
478                             var bn = decodeURIComponent(File.basename(target));
479                             target = _t.toFilename(info.fromURL+'/'+ bn);
480                         }
481                         
482                         
483                         
484                         
485                         //File.write(target ,decodeURIComponent(escape(base64.decode( ret.data))));
486                         print("GOT array sized: " + ret.data.length);
487                         
488                         File.writeBinaryArray(target,ret.data);
489                         print("Wrote to file: " + target);
490                         break;
491                 }
492                 
493                 // if it's HTML then add it to parse queue
494                 // otehrwise save it.. and run the queue again.
495                
496                 
497                 //File.write(storedir+"/parsequeue/"+Math.random(), msg);
498             }
499             
500            
501             if (ret.method == 'downloadhead'){
502                 // got the results from download page:
503                 // requesturl 
504                 // remove from downloadqueue.
505                 
506                 //
507                 try { 
508                     var mt = ret.contentType.split(';').shift();
509                 } catch( e) {
510                     print("INVALID CONTENT TYPE? \n" + JSON.stringify(ret));
511                     mt='';
512                 }
513                 
514                 switch(mt) {
515                     case 'text/html':
516                         // add to parse QUEUE..
517                         print("moving to parse queue");
518                         _t.moveToParse(ret.requesturl);
519                         break;
520                     // stuf we do not care about..
521                     case 'application/atom+xml':
522                     case '':
523                         print("moving to done queue");
524                          _t.moveToDone(ret.requesturl);
525                         break;
526                     
527                     default:
528                         print("calling download file");
529                         _t.downloadpage( ret.requesturl );
530                         // keep it on the queue..
531                         // do not run the queue..
532                         return true;
533                      
534                 }
535                 
536                 // if it's HTML then add it to parse queue
537                 // otehrwise save it.. and run the queue again.
538                
539                 
540                 //File.write(storedir+"/parsequeue/"+Math.random(), msg);
541             }
542             _t.queuerun();
543             
544             return true;
545         });
546         
547         
548         this.toFilename = function(url)
549         {
550             url = url.replace(/^http[s]*:\/\//, '');
551             var p = url.split('/');
552             p.unshift(storedir+'/output');
553             for (var i =1 ;i < p.length; i++) {
554                 p[i] = encodeURIComponent(p[i]);
555             
556             }
557             p[p.length-1] = decodeURIComponent(p[p.length-1]);
558             ret = p.join('/');
559             var dir = File.dirname(ret);
560             File.mkdirall(dir);
561             return ret;
562             
563         }
564         this.checkdomain = function(comp)
565         {
566             var b = parseUri(this.uri);
567             var d = parseUri(comp);
568             return (d.host == b.host && d.protocol == b.protocol);
569             
570             
571         }
572         
573         this.dupeCheck = function(url)
574         {
575             
576            // order - return highest up the queue first..
577             if (File.exists(downloaddir +'/' + encodeURIComponent(url))) {
578                 return downloaddir +'/' + encodeURIComponent(url);
579             }
580              if (File.exists(parsedir +'/' + encodeURIComponent(url))) {
581                 return parsedir +'/' + encodeURIComponent(url);
582             }
583             if (File.exists(donedir +'/' + encodeURIComponent(url))) {
584                 return donedir +'/' + encodeURIComponent(url);
585             }
586             return  false;
587             
588             
589         }
590         this.moveToParse = function(url)
591         {
592             var old = this.dupeCheck(url);
593             var target =parsedir +'/' + encodeURIComponent(url);
594             if (old == target) {
595                 return;
596             }
597             File.write(target, old ? File.read(old) : '');
598             if (old) {
599                 File.remove(old);
600             }
601             
602         }
603         
604         this.moveToDownload= function(url)
605         {
606             var old = this.dupeCheck(url);
607             var target =downloaddir +'/' + encodeURIComponent(url);
608             if (old == target) {
609                 return;
610             }
611             File.write(target, old ? File.read(old) : '');
612             if (old) {
613                 File.remove(old);
614             }
615             
616         }
617         this.moveToDone= function(url)
618         {
619             var old = this.dupeCheck(url);
620             var target = donedir +'/' + encodeURIComponent(url);
621             if (old == target) {
622                 return;
623             }
624             File.write(target, old ? File.read(old) : '');
625             if (old) {
626                 File.remove(old);
627             }
628             
629         }
630         
631     }
632 });
633
/**
 * Splits a URI string into named parts.
 *
 * The result has one string property per entry of parseUri.options.key
 * ("" for parts that are absent) plus `queryKey`, an object holding the
 * name/value pairs of the query string. Which regular expression is used
 * depends on parseUri.options.strictMode.
 */
function parseUri (str) {
    var opts = parseUri.options;
    var mode = opts.strictMode ? "strict" : "loose";
    var match = opts.parser[mode].exec(str);
    var uri = {};

    // Capture group n of either parser corresponds to key[n].
    for (var idx = 13; idx >= 0; idx--) {
        uri[opts.key[idx]] = match[idx] || "";
    }

    // key[12] is the raw query string; replace() is used only to walk its pairs.
    var pairs = {};
    uri[opts.q.name] = pairs;
    uri[opts.key[12]].replace(opts.q.parser, function (whole, name, value) {
        if (name) {
            pairs[name] = value;
        }
    });

    return uri;
}

parseUri.options = {
    strictMode: false,
    key: [
        "source", "protocol", "authority", "userInfo", "user", "password", "host",
        "port", "relative", "path", "directory", "file", "query", "anchor"
    ],
    q: {
        name: "queryKey",
        parser: /(?:^|&)([^&=]*)=?([^&]*)/g
    },
    parser: {
        strict: /^(?:([^:\/?#]+):)?(?:\/\/((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?))?((((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/,
        loose:  /^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/)?((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/
    }
};