--- /dev/null
+<?php
+/**
+ * Table Definition for reddit_livefeed_embed
+ */
+require_once 'DB/DataObject.php';
+
+
+/**
+ * One embedded item (row of reddit_livefeed_embed) belonging to a reddit_livefeed row.
+ *
+ * Besides the table mapping, this class can render the embed to a local HTML
+ * file through an external webkit script and then produce a "clean" copy in
+ * which all CSS has been inlined and class/id attributes removed.
+ */
+class Pman_Reddit_DataObjects_Reddit_livefeed_embed extends DB_DataObject
+{
+    ###START_AUTOCODE
+    /* the code below is auto generated do not remove the above tag */
+
+    public $__table = 'reddit_livefeed_embed'; // table name
+    public $id; // int(11) not_null primary_key auto_increment
+    public $feed_id; // string(254) not_null
+    public $seqid; // int(11) not_null
+    public $url; // string(254) not_null
+
+
+    /* the code above is auto generated do not remove the tag below */
+    ###END_AUTOCODE
+
+    /**
+     * When true, convertHTML() echoes the intermediate HTML of each stage
+     * inside <textarea> blocks.
+     *
+     * @var bool
+     */
+    public $debug_on = false;
+
+    /**
+     * CSS text collected for the page currently being converted, keyed by the
+     * URL it came from (inline <style> blocks are keyed by the page URL).
+     *
+     * @var array
+     */
+    public $styleSheets = array();
+
+    /**
+     * Render this embed with the external webkit script, then write a cleaned
+     * copy (CSS inlined, <head>/<style> removed, class/id attributes stripped)
+     * next to it as "<uid>.<seqid>.clean.html".
+     *
+     * Progress messages are echoed to stdout.
+     *
+     * @return null|false null when the rendered file already exists or the
+     *                    conversion completed; false when the feed row cannot
+     *                    be loaded or a stage produced nothing usable.
+     */
+    public function convertHTML()
+    {
+        $feed = DB_DataObject::factory('reddit_livefeed');
+        if (!$feed->get('id', $this->feed_id)) {
+            // without the feed row there is no uid/created_dt to build paths from
+            echo "feed not found: {$this->feed_id}\n";
+            return false;
+        }
+
+        $base = $this->outputBase($feed);
+        $file = $base . '.html';
+        if (file_exists($file)) {
+            echo "file exists already.";
+            return;
+        }
+
+        $dir = dirname($file);
+        if (!file_exists($dir)) {
+            echo $dir . "\n";
+            if (!mkdir($dir, 0775, true) && !is_dir($dir)) {
+                echo "unable to create directory: $dir\n";
+                return false;
+            }
+        }
+
+        $url = 'http://www.redditmedia.com/mediaembed/liveupdate/tnc30xhiiqom/LiveUpdate_' . $feed->uid . '/' . $this->seqid;
+
+        // both values end up on a shell command line, so they must be quoted
+        $cmd = '/usr/bin/xvfb-run seed app.webkitpdf/main.js -u ' . escapeshellarg($url)
+             . ' -d 5000 -H ' . escapeshellarg($file);
+        echo "\n\n".$file."\n\n";
+        echo $cmd;
+        shell_exec($cmd);
+
+        // the renderer may have failed without producing the file
+        $data = is_readable($file) ? file_get_contents($file) : false;
+        if ($data === false || !strlen(trim($data))) {
+            return false;
+        }
+
+        // htmltidy / libxml do not understand HTML5 elements - turn them into divs
+        $data = $this->flattenHtml5Tags($data);
+
+        // repair the markup so that DOMDocument can load it
+        $data = tidy_repair_string(mb_convert_encoding($data, 'HTML-ENTITIES', "UTF-8"), array(), 'UTF8');
+        if (!is_string($data) || $data === '') {
+            return false;
+        }
+        $this->debugDump("Created tidy....", $data);
+
+        // collect parser warnings quietly; the previous setting is restored below
+        $prevLibxml = libxml_use_internal_errors(true);
+
+        $doc = new DOMDocument();
+        $doc->loadHTML($data);
+        $xpath = new DOMXpath($doc);
+
+        echo "about to do stylesheets\n";
+
+        // start from a clean slate so CSS from a previously converted embed
+        // is not applied to this one
+        $this->styleSheets = array();
+
+        // linked stylesheets
+        foreach ($xpath->query('//link[@href]') as $link) {
+            if ($link->getAttribute('rel') != 'stylesheet') {
+                continue;
+            }
+            $this->addStylesheet($this->relPath($url, $link->getAttribute('href')));
+        }
+
+        // inline style blocks: collect the CSS, then drop the element
+        foreach ($xpath->query('//style') as $style) {
+            // textContent yields the raw CSS whether libxml stored it as a
+            // text or a CDATA node; literal CDATA markers are still removed
+            $css = str_replace(array('<![CDATA[', ']]>'), '', $style->textContent);
+            $this->addStylesheet($url, $css);
+            $style->parentNode->removeChild($style);
+        }
+
+        // remove head..
+        foreach ($xpath->query('//head') as $head) {
+            $head->parentNode->removeChild($head);
+        }
+
+        $doc->formatOutput = true;
+        $data = $doc->saveHTML();
+        $this->debugDump("After remove style and CSS....", $data);
+
+        echo "about to do overlay stylesheets\n";
+
+        require_once 'HTML/CSS/InlineStyle.php';
+        $htmldoc = new HTML_CSS_InlineStyle($data);
+        if (!empty($this->styleSheets)) {
+            $htmldoc->applyStylesheet(array_values($this->styleSheets));
+        }
+        $html = $htmldoc->getHTML();
+
+        echo "final cleanup\n";
+
+        // finally run it through again removing all the class's / id's so they do not clash.
+        $doc = new DOMDocument();
+        $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"));
+        $xpath = new DOMXpath($doc);
+        foreach (array('class', 'id') as $attr) {
+            foreach ($xpath->query('//*[@' . $attr . ']') as $node) {
+                $node->removeAttribute($attr);
+            }
+        }
+
+        $doc->formatOutput = true;
+        $html = $doc->saveHTML();
+
+        // NOTE: a HTML_Safe pass (stripping position/size/margin CSS) used to
+        // be planned here but is disabled; the label below is kept as-is.
+        $this->debugDump("After HTML_Safe....", $html);
+
+        libxml_clear_errors();
+        libxml_use_internal_errors($prevLibxml);
+
+        if (file_put_contents($base . '.clean.html', $html) === false) {
+            echo "unable to write: {$base}.clean.html\n";
+            return false;
+        }
+
+        // NOTE(review): presumably throttles successive renders - confirm
+        sleep(1);
+    }
+
+    /**
+     * Path (without extension) under /home/reddit where files for this embed live:
+     * /home/reddit/YYYY/MM/DD/<feed uid>.<seqid>
+     *
+     * The folder date is the feed's created_dt shifted back 8 hours.
+     * NOTE(review): the shift presumably compensates for a timezone - confirm.
+     *
+     * @param object $feed loaded reddit_livefeed row (uses ->created_dt and ->uid)
+     * @return string
+     */
+    protected function outputBase($feed)
+    {
+        return '/home/reddit/' . date('Y/m/d', strtotime($feed->created_dt . " - 8 HOURS"))
+            . '/' . $feed->uid . '.' . $this->seqid;
+    }
+
+    /**
+     * Rename the opening and closing tags of HTML5-only elements to div.
+     *
+     * The tag name must be followed by whitespace, '/' or '>' so that longer
+     * names sharing a prefix (e.g. <rtc>) are left alone.
+     *
+     * @param string $html
+     * @return string
+     */
+    protected function flattenHtml5Tags($html)
+    {
+        $tags = array(
+            'canvas', 'audio', 'embed', 'source', 'track',
+            'video', 'datalist', 'keygen', 'output', 'article',
+            'aside', 'bdi', 'details', 'dialog', 'figcaption',
+            'figure', 'footer', 'header', 'main',
+            'mark', 'menuitem', 'meter', 'nav', 'progress',
+            'rp', 'rt', 'ruby', 'section', 'summary', 'time',
+            'wbr',
+        );
+        $pattern = '/<(\/?)(?:' . implode('|', $tags) . ')(?=[\s\/>])/i';
+        $out = preg_replace($pattern, '<${1}div', $html);
+
+        // preg_replace() returns null on a PCRE failure - keep the input then
+        return $out === null ? $html : $out;
+    }
+
+    /**
+     * Echo one stage of the conversion when debugging is enabled.
+     *
+     * @param string $title label printed before the dump
+     * @param string $html  markup to show inside the textarea
+     * @return void
+     */
+    protected function debugDump($title, $html)
+    {
+        if (!$this->debug_on) {
+            return;
+        }
+        echo $title . " \n\n";
+        echo "<textarea style=\"width: 500px; height: 250px;\">{$html}</textarea>";
+        echo "\n\n";
+    }
+
+    /**
+     * Resolve $url against $base and return an absolute URL.
+     *
+     * References that already carry a scheme (http:, https:, mailto:, data:, ...)
+     * are returned untouched. If $base has no scheme/host to resolve against,
+     * $url is returned as it is.
+     *
+     * @param string $base absolute URL of the document containing the reference
+     * @param string $url  the reference found in that document
+     * @return string
+     */
+    public function relPath($base, $url)
+    {
+        $base = trim($base);
+        $url = trim($url);
+
+        // already absolute, or an opaque URI such as data:/mailto:
+        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)) {
+            return $url;
+        }
+
+        $ui = parse_url($base);
+        if (!is_array($ui) || empty($ui['scheme']) || empty($ui['host'])) {
+            return $url;
+        }
+
+        // protocol-relative: only the scheme comes from the base
+        if (substr($url, 0, 2) == '//') {
+            return $this->cleanPath($ui['scheme'] . ':' . $url);
+        }
+
+        $root = $ui['scheme'] . '://' . $ui['host'] . (isset($ui['port']) ? ':' . $ui['port'] : '');
+        $path = isset($ui['path']) ? $ui['path'] : '';
+
+        // host-relative
+        if (substr($url, 0, 1) == '/') {
+            return $this->cleanPath($root . $url);
+        }
+        // base has no path at all
+        if ($path === '') {
+            return $this->cleanPath($root . '/' . $url);
+        }
+        // base is a directory
+        if (substr($path, -1) == '/') {
+            return $this->cleanPath($root . $path . $url);
+        }
+        // base is a file: step out of it; cleanPath() collapses the '/../'
+        return $this->cleanPath($root . $path . '/../' . $url);
+    }
+
+    /**
+     * Normalise the path part of a URL using Net_URL::resolvePath().
+     *
+     * @param string $url absolute URL
+     * @return string the URL as rebuilt by Net_URL::getURL()
+     */
+    public function cleanPath($url)
+    {
+        require_once 'Net/URL.php';
+        $u = new Net_URL($url);
+        $u->path = $u->resolvePath($u->path);
+        return $u->getURL();
+    }
+
+    /**
+     * Register a stylesheet in $this->styleSheets.
+     *
+     * With $data given, the CSS text is appended to the entry for $url.
+     * Without it, the stylesheet is fetched from $url (once per URL) using a
+     * file cache in /tmp. In both cases @import rules are followed and url()
+     * references are made absolute.
+     *
+     * @param string       $url  URL of the stylesheet, or of the page for inline CSS
+     * @param string|false $data inline CSS text, or false to fetch $url
+     * @return void
+     */
+    public function addStylesheet($url, $data = false)
+    {
+        // strict test: an empty inline <style> must not trigger a fetch of $url
+        if ($data !== false) {
+            if (!isset($this->styleSheets[$url])) {
+                $this->styleSheets[$url] = '';
+            }
+            $data = (string) $data;
+            $data = $this->replaceImports($data, $url);
+            $data = $this->replaceImages($data, $url);
+            $this->styleSheets[$url] .= "\n\n" . $data;
+            return;
+        }
+
+        // it's a url.. do not fetch twice..
+        if (isset($this->styleSheets[$url])) {
+            return;
+        }
+
+        $data = $this->fetchStylesheet($url);
+        if ($data === false || $data === '') {
+            echo "RETURNED EMPTY: $url";
+            return;
+        }
+
+        // placeholder so that a cycle of @import rules cannot recurse forever
+        $this->styleSheets[$url] = '';
+        $data = $this->replaceImports($data, $url);
+        $data = $this->replaceImages($data, $url);
+
+        // re-insert at the end so imported sheets stay ahead of this one
+        unset($this->styleSheets[$url]);
+        $this->styleSheets[$url] = $data;
+    }
+
+    /**
+     * Return the body of a remote stylesheet, using /tmp/<host><md5(url)> as a
+     * cross-process cache. Only non-empty 200 responses are cached, so a
+     * failed request is retried next time.
+     *
+     * @param string $url absolute stylesheet URL
+     * @return string|false CSS text, '' when the request failed, or false when
+     *                      the cache file could not be read
+     */
+    protected function fetchStylesheet($url)
+    {
+        $ui = parse_url($url);
+        $host = (is_array($ui) && isset($ui['host'])) ? $ui['host'] : '';
+        $cfile = '/tmp/' . $host . md5($url);
+
+        if (file_exists($cfile)) {
+            return file_get_contents($cfile);
+        }
+
+        require_once 'HTTP/Request.php';
+        $req = new HTTP_Request($url, array(
+            'allowRedirects' => true,
+            'maxRedirects' => 2
+        ));
+        $res = $req->sendRequest();
+
+        // sendRequest() hands back a PEAR_Error on transport failure
+        if (is_a($res, 'PEAR_Error') || $req->getResponseCode() != 200) {
+            return '';
+        }
+
+        $body = (string) $req->getResponseBody();
+        if ($body !== '') {
+            file_put_contents($cfile, $body);
+        }
+        return $body;
+    }
+
+    /**
+     * Rewrite every url(...) reference in a stylesheet to an absolute,
+     * double-quoted URL resolved against $url.
+     *
+     * @param string $data CSS text
+     * @param string $url  URL the CSS was loaded from
+     * @return string CSS text with absolute url() references
+     */
+    public function replaceImages($data, $url)
+    {
+        if (!preg_match_all('/url\(([^\)]+)\)/mi', $data, $matches)) {
+            return $data;
+        }
+
+        $map = array();
+        foreach ($matches[0] as $i => $whole) {
+            // strip surrounding whitespace and either kind of quote
+            $ref = trim($matches[1][$i], " \t\r\n\"'");
+            $map[$whole] = 'url("' . $this->relPath($url, $ref) . '")';
+        }
+
+        // strtr() substitutes in one pass, so replaced text is never re-scanned
+        return strtr($data, $map);
+    }
+
+    /**
+     * Remove @import rules from a stylesheet and register each imported
+     * stylesheet through addStylesheet().
+     *
+     * Both `@import url(x)` and `@import "x"` forms are recognised; a trailing
+     * media list up to the terminating ';' is removed together with the rule.
+     *
+     * @param string $data CSS text
+     * @param string $url  URL the CSS was loaded from
+     * @return string CSS text without the @import rules
+     */
+    public function replaceImports($data, $url)
+    {
+        $pattern = '/@import\s+(?:url\(\s*([^\)]+?)\s*\)|"([^"]+)"|\'([^\']+)\')(?:[^;{}]*;)?/i';
+        if (!preg_match_all($pattern, $data, $matches)) {
+            return $data;
+        }
+
+        foreach ($matches[0] as $i => $whole) {
+            // remove original... make it meaningless..
+            $data = str_replace($whole, '', $data);
+
+            // exactly one of the three capture groups holds the reference
+            $cssurl = '';
+            foreach (array(1, 2, 3) as $g) {
+                if (isset($matches[$g][$i]) && $matches[$g][$i] !== '') {
+                    $cssurl = $matches[$g][$i];
+                    break;
+                }
+            }
+            $cssurl = trim($cssurl, " \t\r\n\"'");
+            if ($cssurl === '') {
+                continue;
+            }
+
+            $this->addStylesheet($this->relPath($url, $cssurl));
+        }
+
+        return $data;
+    }
+}
\ No newline at end of file