BrowserView.js
[app.wkmirror] / BrowserView.js
1 Gtk = imports.gi.Gtk;
2 WebKit = imports.gi.WebKit;
3
4 TabbedBrowser = imports.TabbedBrowser;
5 File = imports.File.File;
6
7 base64 = imports.base64.base64;
8
9 BrowserView = new GType({
10     parent: WebKit.WebView.type,
11     name: "BrowserView",
12     init: function ()
13     {
14         // Private
15         
16         
17         var _t = this;
18         
19         var tab;
20         var browsePage = false;
21         var maxQueue = 0;
22         var injected = {};
23         
24         var storedir = '/home/alan/wkqueue';
25         if (!File.exists(storedir)) {
26             File.mkdir(storedir);
27         }
28         var parsedir = storedir + '/parse_queue';
29         var downloaddir = storedir + '/download_queue';
30         var donedir = storedir + '/downloaded_queue';
31         if (!File.exists(parsedir)) {
32             File.mkdir(parsedir);
33         }
34         if (!File.exists(downloaddir)) {
35             File.mkdir(downloaddir);
36         }
37         if (!File.exists(donedir)) {
38             File.mkdir(donedir);
39         }
40         
41         var update_title = function (web_view, web_frame, title)
42         {
43             if(title.length > 25)
44                 title = title.slice(0,25) + "...";
45
46             tab.get_tab_label().label = title;
47         };
48
49         var update_url = function (web_view, web_frame)
50         {
51             var toolbar = tab.get_toolbar();
52
53             toolbar.set_url(web_frame.get_uri());
54             toolbar.set_can_go_back(web_view.can_go_back());
55             toolbar.set_can_go_forward(web_view.can_go_forward());
56         };
57
58         var update_progress = function (bar, progress)
59         {
60             tab.get_toolbar().set_progress(progress / 100);
61         };
62
63         var create_new_tab = function (web_view, web_frame, new_web_view)
64         {
65             new_web_view = new BrowserView();
66             new_web_view.signal.web_view_ready.connect(show_new_tab);
67             return new_web_view;
68         };
69
70         var show_new_tab = function (new_web_view)
71         {
72             TabbedBrowser.browser.new_tab("", new_web_view);
73
74             return false;
75         };
76
77         var hover_link = function (web_view, link, url)
78         {
79             tab.get_statusbar().set_status(url);
80         };
81
82         
83         
84         
85         this.add_inject = function(force)
86         {
87             
88             if (force || (typeof(injected[this.uri]) == 'undefined' )) {
89                 injected[this.uri] = 0;
90             }
91             if (injected[this.uri] > 2) {
92                 return;
93             }
94             injected[this.uri]++;
95             var fn = __script_path__ + "/inject.js";
96             if (File.exists(fn)) {
97                 print("Adding inject");
98                 var newjs = File.read(__script_path__ + "/inject.js");
99                 TabbedBrowser.browser.current_tab().get_web_view().execute_script(
100                     newjs
101                     
102                 );
103             }
104             
105         }
106          this.add_nsinject = function(force)
107         {
108             
109             //if (force || (typeof(injected[this.uri]) == 'undefined' )) {
110             //    injected[this.uri] = 0;
111             //}
112             //if (injected[this.uri] > 2) {
113             //    return;
114             //}
115             //injected[this.uri]++;
116             var fn = __script_path__ + "/nsinject.js";
117             //if (File.exists(fn)) {
118                 print("Adding inject");
119                 var newjs = File.read(__script_path__ + "/nsinject.js");
120                 TabbedBrowser.browser.current_tab().get_web_view().execute_script(
121                     newjs
122                     
123                 );
124             //}
125             
126         }
127         
128         
129         
130         
131         
132         var load_finished = function ()
133         {
134             print("load finished");
135             tab.get_toolbar().set_progress(0);
136             
137             
138             
139             
140             _t.add_inject(true);
141           
142             _t.add_nsinject(true);
143               print(typeof(Seed.argv[2]));
144             if (Seed.argv[2] == null) {
145                 print("NEED ID!");
146                 Seed.quit();
147             
148             }
149             _t.nsdownloadNext(Seed.argv[2]  * 1)
150           
151             
152             if (browsePage) {
153                 print("onload: calling gather links");
154                 TabbedBrowser.browser.current_tab().get_web_view().execute_script(
155                     'BrowserMirror.gatherlinks();'
156                     
157                 );
158                 
159             }
160             
161             
162         };
163
164         var load_committed = function (web_view, web_frame)
165         {
166             update_url(web_view, web_frame);
167         };
168
169         var clicked_link = function (web_view, web_frame, request,
170                                      action, decision, window)
171         {
172             if(action.get_reason() == WebKit.WebNavigationReason.LINK_CLICKED &&
173                action.get_button() == 2)
174             {
175                 browser.new_tab(request.get_uri(), null);
176                 return true;
177             }
178
179             return false;
180         };
181
182         // Public
183         
184         
185         this.browse = function (url)
186         {
187             if(url.search("://") < 0)
188                 url = "http://" + url;
189
190             this.open(url);
191         };
192
193         this.set_tab = function (new_tab)
194         {
195             tab = new_tab;
196         };
197
198         this.get_tab = function ()
199         {
200             return tab;
201         };
202
203
204         this.downloadqueue = function()
205         {
206             var filesList = File.list(downloaddir);
207             print("DOWNLOAD QUEUE LENGTH: " + filesList.length);
208             if (!maxQueue || maxQueue < filesList.length) {
209                 maxQueue  = filesList.length;
210             }
211             
212             tab.get_toolbar().set_progress(filesList.length / Math.max(maxQueue, filesList.length ));
213             if (filesList == null || filesList.length == 0  ){
214                 
215                 return false;
216             }  
217             var url = decodeURIComponent(filesList[0]);
218             if (!this.checkdomain(url)) {
219                 print("SKIP (external domain) : " + url);
220                 File.remove(downloaddir + '/' + filesList[0]);
221                 return this.downloadqueue();
222                 
223             }
224             this.downloadhead( url);
225             return true;
226             
227         };
228         
229     
230         this.queuerun = function()
231         {
232             if (this.downloadqueue()) return true;
233             return this.parsequeue();
234         };
235         
236         this.parsequeue = function(){
237             var filesList = File.list(parsedir);
238             if (filesList == null || filesList.length == 0  ){
239                 return false;
240             } 
241             
242             browsePage = decodeURIComponent(filesList[0]);
243             print("parsing page:" + browsePage );
244             this.browse(browsePage);
245             
246              
247             
248             
249             
250             return true;
251         };
252         
253         
254         this.downloadpage = function(link){
255             print("calling download page: " + link);
256             _t.add_inject(); // just in case..
257                 //var url = File.read(__script_path__+"/downloadqueue/"+link);
258             this.execute_script(
259                 "BrowserMirror.downloadpage(" + JSON.stringify(link) +");"
260             );          
261                 
262             
263         };
264  
265         this.downloadhead = function(link){
266             print("calling download head: " + link);
267             _t.add_inject(); // just in case..
268                 //var url = File.read(__script_path__+"/downloadqueue/"+link);
269             this.execute_script(
270                 "BrowserMirror.downloadhead(" + JSON.stringify(link) +");"
271             );          
272                  
273         };
274         var nsqueue = false;
275         this.nsdownloadNext = function()
276         {
277             /*if (nsqueue === false) {
278                 nsqueue = [
279                            //282,273,285,395,271,272,278,394,402,279,432,280,284,281,433,437,404,444,151,152,1,283,287,288,418,407,398,449,147,150,131,149,443,148,411,405,415,417,416,406,408,399,410,409,412,400,155,133,124,157,118,183,450,434,301,435,130,286,162,292,431,161,290,146,145,158,159,120,137,128,139,163,293,160,289,291,141,140,138,142,144,110,401,156,154,153,170,167,295,166,294,277,425,164,165,168,296,169,297,113,171,119,180,300,176,175,172,302,299,
280                            //298,421,187,307,428,275,269,304,181,182,174,173,184,303,112,179,114,305,177,178,186,306,403,420,111,109,309,188,308,446,445,310,311,53,189,52,312,419,270,190,191,192,193,54,194,108,268,121,195,206,396,205,207,324,220,340,212,331,219,339,222,342,436,209,327,424,208,326,210,328,330,325,211,329,218,337,338,224,223,217,336,214,333,213,332,216,335,221,341,215,334,197,314,
281                             //196,313,198,316,201,320,317,204,323,203,322,202,321,200,319,199,318,346,58,267,393,265,391,438,264,390,261,387,262,388,260,386,266,392,413,414,263,389,347,225,343,227,345,226,344,422,228,348,237,358,231,352,229,349,351,233,354,238,359,360,239,361,230,350,232,353,234,355,439,236,357,240,362,235,356,254,380,257,383,259,385,255,381,256,448,382,258,384,374,252,375,376,251,373,378,429,430,379,253,244,366,247,
282                            //369,363,242,364,241,243,365,246,368,245,367,377,427,
283                            426,250,447,372,249,371,248,370,442,397,129,441,440,116,122,123,125,126,136,115,117,127
284                         ];
285             }
286             
287             if (!nsqueue.length) {
288                 print("DONE");
289                 return;
290             }
291             var pg = nsqueue.shift() ;
292             
293             
294             */
295             
296             //print(typeof(Seed.argv[2]));
297             if (!Seed.argv[2]) {
298                 print("NEED ID!");
299                 Seed.quit();
300             
301             }
302             var pg = Seed.argv[2]  * 1;
303             if ( File.exists( storedir+'/output/' + pg + '.csv')) {
304                 print("DONE : " + storedir+'/output/' + pg + '.csv');
305                 Seed.quit();
306                 return;
307             }
308             
309             var pd = JSON.parse(File.read('/home/alan/.nspasswd'));
310             
311             print("downloadnext : " + pg);
312             this.execute_script(
313                 'NS.login(' +  JSON.stringify(pd.username) + ',' + JSON.stringify(pd.password) + ',' + pg + ' );'
314                 
315             );
316             
317             
318             
319         }
320
321         // Implementation
322         //this.set_scroll_adjustments(null, null);
323
324         this.signal.title_changed.connect(update_title);
325         this.signal.load_committed.connect(load_committed);
326         this.signal.load_finished.connect(load_finished);
327         this.signal.load_progress_changed.connect(update_progress);
328
329         this.signal.create_plugin_widget.connect(function(wv, mt, uri, parm) { 
330                 print("got plugin requrest");
331         });
332
333 var mv = this.signal.webkit_web_view_get_main_frame();
334 //var db = WebKit.get_web_plugin_database ();
335 //var pg = db.get_plugins();
336 print(mv);
337 print("plugins");
338 //for(var i = 0 ; i < pg.length; i++) {
339
340 //print(pg[i].get_name());
341 //print(pg[i].get_enabled() *1);
342 //print(pg[i].get_description() *1);
343
344 //}
345         // For some reason, this segfaults seed in the instance init closure handler
346         // Once that's fixed, uncommenting the next line will give middle-click-open-in-new tab
347         //this.signal.navigation_policy_decision_requested.connect(clicked_link);
348
349         this.signal.hovering_over_link.connect(hover_link);
350
351         this.signal.create_web_view.connect(create_new_tab);
352         
353          
354         print("ADDing console message sig handler");
355         
356         
357         
358         
359         this.signal.console_message.connect(function(wv, msg, line, sid) {
360             // print('BrowserView.js got ' + msg);
361             var methodname;
362             var ret;
363             try {
364                 ret = JSON.parse(msg);
365             
366             } catch(e) {
367                 print("GOT INVALID message:" + msg)
368                 return true;
369                 
370                 
371             }
372             print("got method : " + ret.method);
373             
374             if (ret.method == 'exit') {
375                 Seed.quit();
376             }
377             
378             
379             if (ret.method == 'nsdownloadpage'){
380                 try { 
381                     var mt = ret.contentType.split(';').shift();
382                 } catch( e) {
383                     print("INVALID CONTENT TYPE? \n" + JSON.stringify(ret));
384                     mt='';
385                 }
386                 
387                 switch(mt) {
388                    
389                     case '':
390                         print("CONSOLE GOT BLANK - call download next?");
391                           Seed.quit();
392                          //_t.nsdownloadNext();
393                        // _t.moveToDone(ret.requesturl);
394                         break;
395                     
396                     default:
397                         var info = false;
398                         var info_f = _t.dupeCheck(ret.requesturl);
399                         if (info_f) {
400                             info  = JSON.parse(File.read(info_f));
401                         }
402                          var target  = storedir+'/output/' + ret.filename 
403                          // flag it as done..
404                        
405                         
406                         
407                         
408                         
409                         //File.write(target ,decodeURIComponent(escape(base64.decode( ret.data))));
410                         print("GOT array sized: " + ret.data.length);
411                         
412                         File.writeBinaryArray(target,ret.data);
413                         print("Wrote to file: " + target);
414                         // get next..
415                         return true;
416                         //Seed.quit();
417                         //_t.nsdownloadNext();
418                         
419                         
420                         
421                         break;
422                 }
423                 return true; //???
424             }
425             
426             if (ret.method == 'gatherlinks'){
427                 // flag the page as parsed.
428                 
429                 if (browsePage) {
430                     _t.moveToDone(browsePage);
431                     
432                 }
433                 var sourcePage = browsePage;
434                 browsePage = false;
435                 print(typeof(ret.data));
436                 if (typeof(ret.data) != 'object' ) {
437                     print("GOT INVALID DATA?:" + JSON.stringify(ret,null,4));
438                     ret.data= [];
439                     
440                 }
441                 ret.data.forEach(function(ln) {
442                     if (!ln.href.match(/^http[s]*:\/\//)) {
443                         print("SKIP link: " + ln.href);
444                         return;
445                     }
446                     ln.href= ln.href.replace(/#.*$/,'');
447                     
448                     if (!_t.checkdomain(ln.href) ) {
449                         print("SKIP link (external domain): " + ln.href);
450                         return;
451                     }
452                     
453                     // this is just for our purposes..
454                     
455                     if (ln.href.match(/\/pages\/[0-9]+$/)) {
456                         print("SKIP link (ingore unnamed pages): " + ln.href);
457                         return;
458                     }
459                     
460                     
461                     
462                     var fn = encodeURIComponent(ln.href);
463                     
464                     var dupe  = _t.dupeCheck(ln.href);
465                     if (dupe) {
466                         
467                         if (dupe == downloaddir + '/'  + fn) {
468                             var info = JSON.parse(File.read(dupe));
469                             if (info && info.fromUrl && info.fromUrl.length > sourcePage.length) {
470                                 print("SKIP link (in queue already): " + ln.href);
471                                 return;
472                             }
473                             print("found a longer link for url")
474                         } else {
475                             print("SKIP link (in another queue): " + ln.href);
476                             return;
477                         }
478                     }
479                      
480                     File.write(downloaddir + '/'  + fn, JSON.stringify( {
481                         label : ln.label,
482                         fromURL : sourcePage
483                     })); // write an empyt file indicating it needs downloading..
484                 });         
485                 var filesList = File.list(downloaddir);
486             
487                 maxQueue =  filesList.length;
488             }
489             
490             if (ret.method == 'downloadpage'){
491                 // got the results from download page:
492                 // requesturl 
493                 // remove from downloadqueue.
494              
495                 
496                 //
497                 try { 
498                     var mt = ret.contentType.split(';').shift();
499                 } catch( e) {
500                     print("INVALID CONTENT TYPE? \n" + JSON.stringify(ret));
501                     mt='';
502                 }
503                 
504                 switch(mt) {
505                    
506                     case '':
507                         _t.moveToDone(ret.requesturl);
508                         break;
509                     
510                     default:
511                         var info = false;
512                         var info_f = _t.dupeCheck(ret.requesturl);
513                         if (info_f) {
514                             info  = JSON.parse(File.read(info_f));
515                         }
516                     
517                         _t.moveToDone(ret.requesturl);
518                         // flag it as done..
519                        
520                         // write it...
521                         var target  = _t.toFilename(ret.requesturl);
522                         if (info && info.fromURL) {
523                             var bn = decodeURIComponent(File.basename(target));
524                             target = _t.toFilename(info.fromURL+'/'+ bn);
525                         }
526                         
527                         
528                         
529                         
530                         //File.write(target ,decodeURIComponent(escape(base64.decode( ret.data))));
531                         print("GOT array sized: " + ret.data.length);
532                         
533                         File.writeBinaryArray(target,ret.data);
534                         print("Wrote to file: " + target);
535                         break;
536                 }
537                 
538                 // if it's HTML then add it to parse queue
539                 // otehrwise save it.. and run the queue again.
540                
541                 
542                 //File.write(storedir+"/parsequeue/"+Math.random(), msg);
543             }
544             
545            
546             if (ret.method == 'downloadhead'){
547                 // got the results from download page:
548                 // requesturl 
549                 // remove from downloadqueue.
550                 
551                 //
552                 try { 
553                     var mt = ret.contentType.split(';').shift();
554                 } catch( e) {
555                     print("INVALID CONTENT TYPE? \n" + JSON.stringify(ret));
556                     mt='';
557                 }
558                 
559                 switch(mt) {
560                     case 'text/html':
561                         // add to parse QUEUE..
562                         print("moving to parse queue");
563                         _t.moveToParse(ret.requesturl);
564                         break;
565                     // stuf we do not care about..
566                     case 'application/atom+xml':
567                     case '':
568                         print("moving to done queue");
569                          _t.moveToDone(ret.requesturl);
570                         break;
571                     
572                     default:
573                         print("calling download file");
574                         _t.downloadpage( ret.requesturl );
575                         // keep it on the queue..
576                         // do not run the queue..
577                         return true;
578                      
579                 }
580                 
581                 // if it's HTML then add it to parse queue
582                 // otehrwise save it.. and run the queue again.
583                
584                 
585                 //File.write(storedir+"/parsequeue/"+Math.random(), msg);
586             }
587             _t.queuerun();
588             
589             return true;
590         });
591         
592         
593         this.toFilename = function(url)
594         {
595             url = url.replace(/^http[s]*:\/\//, '');
596             var p = url.split('/');
597             p.unshift(storedir+'/output');
598             for (var i =1 ;i < p.length; i++) {
599                 p[i] = encodeURIComponent(p[i]);
600             
601             }
602             p[p.length-1] = decodeURIComponent(p[p.length-1]);
603             ret = p.join('/');
604             var dir = File.dirname(ret);
605             File.mkdirall(dir);
606             return ret;
607             
608         }
609         this.checkdomain = function(comp)
610         {
611             var b = parseUri(this.uri);
612             var d = parseUri(comp);
613             return (d.host == b.host && d.protocol == b.protocol);
614             
615             
616         }
617         
618         this.dupeCheck = function(url)
619         {
620             
621            // order - return highest up the queue first..
622             if (File.exists(downloaddir +'/' + encodeURIComponent(url))) {
623                 return downloaddir +'/' + encodeURIComponent(url);
624             }
625              if (File.exists(parsedir +'/' + encodeURIComponent(url))) {
626                 return parsedir +'/' + encodeURIComponent(url);
627             }
628             if (File.exists(donedir +'/' + encodeURIComponent(url))) {
629                 return donedir +'/' + encodeURIComponent(url);
630             }
631             return  false;
632             
633             
634         }
635         this.moveToParse = function(url)
636         {
637             var old = this.dupeCheck(url);
638             var target =parsedir +'/' + encodeURIComponent(url);
639             if (old == target) {
640                 return;
641             }
642             File.write(target, old ? File.read(old) : '');
643             if (old) {
644                 File.remove(old);
645             }
646             
647         }
648         
649         this.moveToDownload= function(url)
650         {
651             var old = this.dupeCheck(url);
652             var target =downloaddir +'/' + encodeURIComponent(url);
653             if (old == target) {
654                 return;
655             }
656             File.write(target, old ? File.read(old) : '');
657             if (old) {
658                 File.remove(old);
659             }
660             
661         }
662         this.moveToDone= function(url)
663         {
664             var old = this.dupeCheck(url);
665             var target = donedir +'/' + encodeURIComponent(url);
666             if (old == target) {
667                 return;
668             }
669             File.write(target, old ? File.read(old) : '');
670             if (old) {
671                 File.remove(old);
672             }
673             
674         }
675         
676     }
677 });
678
// Split a URI string into named components.
//
// str: the URI to parse. null and undefined are treated as the empty string
//      (the original stringified them, yielding a host of "null" or
//      "undefined"); any other value is converted with String().
//
// Returns an object with one string property per name in
// parseUri.options.key ("" for parts that are absent) plus a `queryKey`
// object (name taken from options.q.name) mapping query parameter names to
// their values.
function parseUri (str) {
    var o = parseUri.options;
    var text = (str === null || str === undefined) ? "" : String(str);

    // Both shipped patterns match any string; the `|| []` only protects
    // against a replacement parser that can fail to match.
    var m = o.parser[o.strictMode ? "strict" : "loose"].exec(text) || [];
    var uri = {};

    // Capture group i belongs to key i. The count comes from the key list
    // rather than a hard-coded 14, and the loop still runs from the last
    // key to the first so properties are created in the original order.
    var i = o.key.length;
    while (i--) {
        uri[o.key[i]] = m[i] || "";
    }

    // key[12] is "query": break it into name/value pairs.
    uri[o.q.name] = {};
    uri[o.key[12]].replace(o.q.parser, function ($0, $1, $2) {
        if ($1) {
            uri[o.q.name][$1] = $2;
        }
    });

    return uri;
}

// Parser configuration. `key` lists the component names in capture-group
// order; `strictMode` selects which of the two patterns in `parser` is used.
parseUri.options = {
    strictMode: false,
    key: ["source","protocol","authority","userInfo","user","password","host","port","relative","path","directory","file","query","anchor"],
    q:   {
        name:   "queryKey",
        parser: /(?:^|&)([^&=]*)=?([^&]*)/g
    },
    parser: {
        strict: /^(?:([^:\/?#]+):)?(?:\/\/((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?))?((((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/,
        loose:  /^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/)?((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/
    }
};