<?php
/**
 * NOTE(review): the opening of this header comment appears to be missing from
 * the file as received; the surviving text resumes in the middle of a feature
 * list.
 *
 * ...)
 * - Frames
 * - RSS+ATOM-feeds
 * - Removing lots of unnecessary code
 * - Optionally replacing images with their alt-text
 * - Wap-pages passthru
 * - Follow -tags (NOTE(review): a tag name seems to have been lost here)
 *
 * Limitations:
 * - Pages that require an Authorization username and password (not implemented for safety reasons)
 * - If you come across other limitations, please let me know
 *
 **/
session_start();

/**
 * Fetches a remote page over a raw socket, rewrites its links so that they
 * point back at this script, strips markup that is of no use on a small
 * device and wraps the result in $template.
 *
 * Typical use: construct with the target url, optionally change public
 * properties such as $img, call run(), then read $data / $contenttype.
 *
 * NOTE(review): several string literals in this class (the page template,
 * some error messages and some regular expressions) reached this revision
 * with what looks like their HTML/XML markup removed. They are reproduced
 * exactly as received and flagged individually; they need to be restored
 * from the upstream original before the affected features work again.
 */
class Phonifier
{
    /** Base url of the phonifier itself, without trailing slash */
    public $baseurl = "";
    /** The User-Agent header sent with every request */
    public $user_agent = 'Mozilla/5.0 (compatible; Phonifier; +http://www.phonifier.com)';
    /** Maximum time (seconds) fsockopen may take; also used as read timeout */
    public $time_out = 5;
    /** The output data */
    public $data = "";
    /** The raw response headers */
    public $header = "";
    /** Transport prefix handed to fsockopen: "" for http, "ssl://" for https */
    public $scheme = "";
    /** HTTP=80 (default), HTTPS=443 */
    public $port = 80;
    /** Number of redirects / retries followed so far */
    public $times = 0;
    /** Is the accessed page a wml-page? */
    public $iswap = false;
    /** Url-encoded POST body to forward */
    public $post = "";
    /** GET (default) or POST */
    public $method = "GET";
    /** The url the parser has to access */
    public $url = "http://";
    /** Show images in the result (true) or replace them with their alt-text */
    public $img = true;
    /** Result of parse_url() on $url (an empty array when parsing failed) */
    public $urlinfo;
    /** Content-type of the result */
    public $contenttype = "";
    /** Content-length of the result */
    public $contentlength = 0;
    /** Start-time, used for benchmarking */
    public $starttime = 0;
    /** Set as soon as something went wrong */
    public $error = false;
    /** The errortype (for debugging) */
    public $errortype = "";
    /** Is this a feed? */
    public $feed = false;

    /**
     * The sprintf() template used for displaying the result. Arguments:
     * 1 title, 2 content-type, 3 base url, 4 escaped url,
     * 5 / 6 "checked" attribute for images on / off, 7 body.
     *
     * NOTE(review): only %1 and %7 survive in this literal; the markup that
     * used arguments 2-6 appears to have been lost.
     */
    public $template = " %1\$s

images on /off
%7\$s
top ";

    /** Values handed from rewrite() to the preg_replace_callback() callback */
    private $rewriteContext = array();

    /**
     * Make an instance of the Phonifier-class
     *
     * @param string $url The url that you want to access
     */
    public function __construct($url)
    {
        $httpHost = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';
        $scriptName = isset($_SERVER['SCRIPT_NAME']) ? $_SERVER['SCRIPT_NAME'] : '';

        //get the baseurl of the phonifier (dirname may return backslashes on Windows)
        $this->baseurl = preg_replace("/\/$/", "", str_replace('\\', '/', dirname("http://" . $httpHost . $scriptName)));
        $this->starttime = $this->microtime_float();
        $this->url = $url;

        //If there is a POST, forward it. http_build_query() yields the same
        //name=value&... string as urlencode() per pair, but also copes with
        //array-valued fields (name[]), which urlencode() cannot handle.
        if (isset($_POST) && sizeof($_POST) > 0) {
            $this->post = http_build_query($_POST, '', '&');
            $this->method = "POST";
        }
    }

    /**
     * Legacy PHP 4 style constructor name, kept so that code calling
     * parent::Phonifier($url) keeps working.
     *
     * @param string $url The url that you want to access
     */
    public function Phonifier($url)
    {
        $this->__construct($url);
    }

    /**
     * This function activates the Phonifier
     * Run this after you set values like $img
     */
    public function run()
    {
        if ($this->isHttpUrl($this->url)) {
            //initiate vars
            $this->init();
            if (!$this->error) {
                //get the contents of the url
                $this->get();
            }
        } else {
            //the url doesnt start with http(s)
            $this->fail(
                "Url not valid, no http(s)://",
                "",
                empty($this->url) ? "Insert url in textfield and press 'go'" : "Error: the url is not valid"
            );
        }
    }

    /**
     * Initiate function
     * Parses $url into $urlinfo, refuses urls that point back at this server
     * and selects transport prefix and port for the scheme.
     */
    public function init()
    {
        $this->data = "";
        $this->header = "";

        $parsed = @parse_url($this->url);
        //parse_url() returns false for seriously malformed urls
        $this->urlinfo = is_array($parsed) ? $parsed : array();
        $target = isset($this->urlinfo['host']) ? $this->urlinfo['host'] : '';

        //get the hostname without www
        $httpHost = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';
        $host = preg_replace('/^www\./i', '', $httpHost);

        if ($this->isOwnHost($target, $host)) {
            //don't call yourself
            $this->fail("Own domain", "", "Error: choose a domain outside " . htmlspecialchars($host, ENT_QUOTES));
        }

        //when the scheme is https fsockopen wants a 'ssl://'- url
        //and of course a different port
        $scheme = isset($this->urlinfo['scheme']) ? strtolower($this->urlinfo['scheme']) : '';
        if ($scheme == 'https') {
            $this->port = 443;
            $this->scheme = "ssl://";
        } else if ($this->scheme === "ssl://") {
            //a redirect took us from https back to http: undo the ssl settings
            $this->port = 80;
            $this->scheme = "";
        }
    }

    /**
     * returns time
     * Used for benchmarking
     *
     * @return float Unix timestamp with microseconds
     */
    public function microtime_float()
    {
        return microtime(true);
    }

    /**
     * The get function
     * These are the 'brains' of the Phonifier: perform the request, remember
     * cookies, dispatch on the content-type and post-process the body.
     */
    public function get()
    {
        //do the request
        $this->request();
        //NOTE(review): the three timings below are no longer referenced; they
        //presumably were interpolated into the benchmark string further down.
        $requesttime = round($this->microtime_float() - $this->starttime, 5);

        //Put the cookie in the session when set
        if (isset($this->urlinfo['host']) && preg_match_all("/Set-Cookie: (.*)=(.*);/Uis", $this->header, $cookies)) {
            $host = $this->urlinfo['host'];
            if (!isset($_SESSION[$host])) {
                $_SESSION[$host] = array();
            }
            //NOTE(review): the loop header and the first half of this condition
            //were lost from the file as received. Only "... && isset(...) ->
            //unset, else store" survives; the 'deleted' test (the value PHP
            //itself sends when removing a cookie) is a reconstruction and must
            //be confirmed against the upstream original.
            foreach ($cookies[1] as $i => $name) {
                $value = $cookies[2][$i];
                if ($value === 'deleted' && isset($_SESSION[$host][$name])) {
                    unset($_SESSION[$host][$name]);
                } else {
                    $_SESSION[$host][$name] = $value;
                }
            }
        }

        //Check contenttype
        if (preg_match("/Content-Type:(.*)\\n/Uis", $this->header, $content_type)) {
            $this->contenttype = trim($content_type[1]);
            $this->iswap = strpos($this->contenttype, "text/vnd.wap.wml") !== false;

            if (preg_match('#^(application|text)/(atom\+)?xml#i', $this->contenttype)) {
                //rss and atom feeds; the error flag is raised only to skip
                //rewrite()/tiny() below and is cleared again at the end
                $this->error = true;
                $this->feed = true;
                $this->data = $this->feedToHtml();
            } else if (!preg_match('#^text/html#i', $this->contenttype)
                && !preg_match('#application/xhtml\+xml#i', $this->contenttype)
                && !$this->iswap
            ) {
                //anything except (x)html or wap
                $this->fail(
                    "Wrong Content-type",
                    "Wrong Content-type",
                    "The page that I found cannot be optimized for use on a mobile device.\nClick here to access the address without optimization."
                );
            }
        }

        //rewrite the urls
        if (!$this->error) {
            $this->rewrite();
        }
        $rewritetime = round($this->microtime_float() - $this->starttime, 5);

        //clean non-used tags and meta-data
        if (!$this->iswap && !$this->error) {
            $this->tiny();
        }
        $cleantime = round($this->microtime_float() - $this->starttime, 5);

        //collapse all runs of spaces and linebreaks
        $collapsed = preg_replace("/(\r\n|\r|\n| )+/", " ", $this->data);
        if ($collapsed !== null) {
            $this->data = $collapsed;
        }

        if (!$this->iswap) {
            //benchmarks. NOTE(review): the literal is only a line break now.
            $this->data .= "\r\n";
        }
        if ($this->feed) {
            $this->error = false;
        }
        $this->contentlength = strlen($this->data);
        $this->contenttype = empty($this->contenttype) ? "text/html; charset=iso-8859-1" : $this->contenttype;
    }

    /**
     * Recalculate and return the length of the output data.
     *
     * @return int Number of bytes in $data
     */
    public function contentlength()
    {
        $this->contentlength = strlen($this->data);
        return $this->contentlength;
    }

    /**
     * The request-function. It checks the url and handles the request,
     * following redirects and retrying with a trailing slash, at most four
     * times in total. The result ends up in $header / $data, or in the
     * error properties.
     */
    public function request()
    {
        if ($this->url == "" || $this->url == "http://" || $this->url == "https://") {
            $this->fail("Insert url in textfield and press 'go'", "", "Insert url in textfield and press 'go'");
            return;
        }
        //every time we call this function we count it... we don't want to run forever...
        if ($this->times >= 4) {
            $this->fail(
                "Too many redirects",
                "",
                "Error: redirected too many times.. quitting.\nClick here to access the address without optimization."
            );
            return;
        }
        if (!isset($this->urlinfo['host']) || !isset($this->urlinfo['scheme'])) {
            $this->fail(
                "No optimization",
                "",
                "Error: the url could not be optimized.\nClick here to access the address without optimization."
            );
            return;
        }

        //open the socket
        $fp = @fsockopen($this->scheme . $this->urlinfo['host'], $this->port, $errno, $errstr, $this->time_out);
        if (!$fp) {
            $this->fail(
                "Timeout",
                "",
                "Error: the request timed out.\nClick here to access the address without optimization."
            );
            return;
        }
        //fsockopen's timeout only covers connecting; bound the reads as well
        stream_set_timeout($fp, (int) $this->time_out);

        $this->data = "";
        $this->header = '';
        $this->urlinfo['path'] = isset($this->urlinfo['path']) ? $this->urlinfo['path'] : "";

        fputs($fp, $this->buildRequest());

        //everything up to the first blank line is header, the rest is body
        $receivingheaders = true;
        while (!feof($fp)) {
            $raw = @fgets($fp, 8192);
            if ($raw === false) {
                //end of stream, read error or timeout: stop instead of spinning
                break;
            }
            $line = trim($raw);
            if ($line !== '') {
                if ($receivingheaders) {
                    $this->header .= $line . "\r\n";
                } else {
                    $this->data .= $line . "\r\n";
                }
            } else {
                $receivingheaders = false;
            }
        }
        fclose($fp);

        //now we have the result. but there can be problems...
        //check if we get a new Location (anchored, so that a
        //Content-Location header is not mistaken for a redirect)
        $location = "";
        preg_match("/^Location:(.*)\\n/Uism", $this->header, $loc);
        //NOTE(review): this pattern is empty as received, so the second branch
        //below can never fire; it presumably matched a redirect declared in
        //the page body (four groups, target url in group 3).
        preg_match("//Uis", $this->data, $loc2);
        if (sizeof($loc) == 2) {
            $location = $loc[1];
        } else if (sizeof($loc2) == 5) {
            $location = $loc2[3];
        }

        if (!empty($location)) {
            $this->followRedirect($location);
        } else if (stripos($this->header, "Bad Request") !== false
            || stripos($this->header, "302 Found") !== false
            || strlen(trim($this->header)) == 0
        ) {
            //or there is a bad request... possible error of missing the last /
            //change it and try again
            $this->url .= '/';
            $this->urlinfo['path'] .= '/';
            $this->post = "";
            //without a body the retry must not be announced as a POST
            $this->method = "GET";
            $this->times++;
            $this->request();
        } else if (stripos($this->header, "301 Moved Permanently") !== false) {
            //we checked this first but if there isn't a
            //Location-header we can't do anything
            $this->fail(
                "Redirect misunderstood",
                "",
                "Error: Redirect misunderstood.\nClick here to access the address without optimization."
            );
        } else if (stripos($this->header, "404 Not Found") !== false) {
            //or doesn't the page exist?
            $this->fail("404", "404 File Not Found", "Error 404: The page you want to visit does not exist");
        } else if (stripos($this->header, "401 Authorization Required") !== false) {
            //or is the page password protected?
            $this->fail(
                "Authorization required",
                "401 Authorization Required",
                "The page you want to visit is password-protected. Click here to access this page."
            );
        }
    }

    /**
     * Rewrite the urls in the response data so that links, forms and frames
     * lead back through the phonifier.
     */
    public function rewrite()
    {
        $scheme = isset($this->urlinfo['scheme']) ? $this->urlinfo['scheme'] : '';
        $host = isset($this->urlinfo['host']) ? $this->urlinfo['host'] : '';
        $path = isset($this->urlinfo['path']) ? $this->urlinfo['path'] : '';

        $base = "{$scheme}://{$host}";
        $baseuri = "";
        $img = $this->img ? 1 : 0;

        //a path that does not end in a slash names a file: use its directory
        if (substr($path, -1) != "/") {
            $path = preg_replace("/\/$/", "", str_replace('\\', '/', dirname($path)));
        }

        //if there is a base-url specified.. use it in every link
        //NOTE(review): this pattern is empty as received, so no base url is
        //ever detected; it presumably had three groups with the url in group 2.
        preg_match("//Ui", $this->data, $baseurl);
        if (sizeof($baseurl) == 4) {
            $baseuri = $baseurl[2];
        }

        //A callback replaces the former /e (eval) modifier, which is gone in
        //PHP 7 and evaluated text taken from the fetched page as PHP code.
        $this->rewriteContext = array($base, $path, $baseuri, $img);
        $rewritten = preg_replace_callback(
            "/(href|action|src|;url)+=(['\"])?+(.+)([\"'> ])/Ui",
            array($this, 'replaceUrlMatch'),
            $this->data
        );
        if ($rewritten !== null) {
            $this->data = $rewritten;
        }
    }

    /**
     * The function that replaces the urls.
     * Resolves one attribute value against the current page and, for links
     * and forms, points it back at the phonifier.
     *
     * @param string $base    scheme://host of the current page
     * @param string $path    Directory of the current page, no trailing slash
     * @param string $baseuri Base url declared by the page itself, or ""
     * @param mixed  $img     1 to show images, 0 to hide them
     * @param string $type    Attribute name (href, action, src or ;url)
     * @param string $a       Opening quote, possibly empty
     * @param string $b       The attribute value
     * @param string $c       The character that ended the value
     * @return string The replacement for the whole attribute
     */
    public function replaceUrl($base, $path, $baseuri, $img, $type, $a, $b, $c)
    {
        $type = strtolower($type);
        $current = !empty($baseuri) ? $baseuri : $base . $path;
        $separator = substr($current, -1) == "/" ? "" : "/";

        if (substr($b, 0, 1) == "/") {
            if (substr($b, 0, 2) != "//") {
                //absolute path on the same host
                $b = (empty($baseuri) ? $base : $baseuri) . $b;
            } else {
                //protocol-relative url
                $b = "http:" . $b;
            }
        } else if (strpos($b, "../") === 0) {
            if ($base == $current) {
                //already at the document root: drop the leading ".."
                $b = $current . substr($b, 2);
            } else {
                $b = $current . $separator . $b;
            }
        } else if (stripos($b, "mailto:") === 0 || stripos($b, "javascript:") === 0) {
            $b = "#";
        } else if (substr($b, 0, 1) != "#" && !$this->isHttpUrl($b)) {
            //relative url
            $b = $current . $separator . $b;
        }

        $isAnchor = substr($b, 0, 1) == "#";
        if ($type == "href" && !$isAnchor) {
            return $type . '=' . $a . "?i={$img}&u=" . urlencode(html_entity_decode($b)) . $c;
        }
        if ($type == "action" && !$isAnchor) {
            //forms are addressed as index.php/<img>/<scheme>/<host and path>
            $slash = strpos($b, "/");
            $scheme = substr($b, 0, $slash - 1);
            $b = trim(substr($b, $slash + 2));
            return $type . '=' . $a . "index.php/{$img}/{$scheme}/" . $b . $c;
        }
        return $type . '=' . $a . $b . $c;
    }

    /**
     * Clean all tags you don't want and wrap what is left in the template.
     */
    public function tiny()
    {
        if (preg_match("/text\/html\; charset=(.*)['\"]/iU", $this->data, $ct)) {
            $this->contenttype = "text/html; charset={$ct[1]}";
        }
        //NOTE(review): the start of this pattern looks truncated (it captures
        //everything in front of the closing title tag on that line).
        $title = preg_match("/(.*)<\/title>/i", $this->data, $titlecontents) ? $titlecontents[1] : "";

        //pattern => replacement, applied in this order
        $search = array(
            '@<!--(.*)-->@Usi' //html-comments
            ,'@\/\/<!\[CDATA\[.*?\/\/\]\]>@ism'
            ,'@on(click|mouseover|mouseout|blur|error|focus|load|unload|submit|reset|abort|change|select)=(\'|").*(["\'])+( |>)@Usi' //all javascript-triggers that results in an js-error
            ,'@<link(.*)media=(\'|")(screen|print)(\'|").*?'.'>\r\n@Ui' //remove stylesheets that a mobile phone doesn't use (the unsupported e-modifier was dropped)
            ,'@<(noedit|iframe|script)[^>]*?'.'>.*?<\/(noedit|iframe|script)>@ism' //script and object-tags
            ,'@<(head|object|style|map)[^>]*?'.'>.*?<\/(head|object|style|map)>@ism' //script and object-tags
            ,'@<(body|p)[^>]*?'.'>@ism' //empty body-tag
            ,'@.*<html[^>]*?'.'>@ism'
            ,'@</tr>@ism'
            ,'@<(\/)?(html|body|div|span|link|meta|font|center|noscript|frameset|noframes|table|tr|th|td|tbody|thead|tfoot)[^>]*?'.'>@ism' //remove tables,div,span,link,meta
            ,'@<\/p>(\r\n)+\|(\r\n)+<p>@ism'
            ,'@((<br ?/?'.'>)+((\r)?\n)*)+@i'
            ,'@(\r\n)+@'
            ,'@(style)=(\'|").*(["\'])+( |>)@Usi'
            ,'@<frame.*src=(\'|")(.*)(\'|").*>@Uis'
            ,'@(target)=(\'|").*(["\'])+( |>)@Usi'
        );
        $replace = array(
            ''
            ,''
            ,'\4'
            ,''
            ,''
            ,''
            ,'<\1>'
            ,''
            ,'<br />'
            ,''
            ,' | '
            ,'<br />'
            ,' '
            ,'\4'
            ,'Frame: <a href=\'?i=1&u=\2\'>\2</a><br />'
            ,''
        );

        if (!$this->img) { //filter out images
            //images with alt
            $search[] = "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i";
            $replace[] = "[$2$3$4] ";
            //images without alt
            $search[] = '/<img.*[^>]*?'.'>/Ui';
            $replace[] = '[img]';
        }

        $cleaned = preg_replace($search, $replace, $this->data);
        if ($cleaned !== null) {
            $this->data = $cleaned;
        }
        $this->data = $this->renderPage($title, $this->data);
    }

    /**
     * When accessing a RSS or ATOM feed this function is called to make the
     * feed readable.
     *
     * NOTE(review): every pattern in this method reached this revision with
     * its element names removed, so as received it cannot recognise channels,
     * items, links or descriptions. The literals are kept untouched and have
     * to be restored from the upstream original.
     *
     * @return string The complete page for the feed
     */
    public function feedToHtml()
    {
        $outputhtml = "";
        $atom = false;

        $channelCount = preg_match_all("|(xml.*encoding=['\"](.*)['\"].*>.*)?<channel>.*<title>(.*).*(.*).*|iUs", $this->data, $channels, PREG_SET_ORDER);
        if ($channelCount == 0) {
            $channelCount = preg_match_all("|(xml.*encoding=['\"](.*)['\"].*>.*)?.*(.*).*|iUs", $this->data, $channels, PREG_SET_ORDER);
            $atom = true;
        }
        //only the first channel is used; reset() yields false when nothing matched
        $channels = reset($channels);
        if (!is_array($channels)) {
            $channels = array();
        }
        $feedTitle = isset($channels[3]) ? $channels[3] : "";

        $outputhtml .= "\n\n{$feedTitle}\n\n\n";

        if ($atom) {
            $itemCount = preg_match_all("|(.*)|iUs", $this->data, $items, PREG_SET_ORDER);
        } else {
            $itemCount = preg_match_all("|(.*)|iUs", $this->data, $items, PREG_SET_ORDER);
        }

        if ($itemCount > 0) {
            $linkmatch = $atom ? "||iUs" : "|(.*)|iUs";
            $descrmatch = $atom ? "|(.*)|iUs" : "|(.*)|iUs";

            foreach ($items as $item) {
                $str_title = "";
                $str_link = ""; //NOTE(review): no longer used by the strings below
                $str_descr = "";

                if (preg_match_all("|(.*)|iUs", $item[0], $title, PREG_SET_ORDER)) {
                    $str_title = $title[0][1];
                }
                if (preg_match_all($linkmatch, $item[0], $link, PREG_SET_ORDER) && isset($link[0][1])) {
                    $str_link = $link[0][1];
                }
                if (preg_match_all($descrmatch, $item[0], $descr, PREG_SET_ORDER)) {
                    //NOTE(review): as received this was str_replace("","",...))
                    //with one closing parenthesis too many; the stray
                    //parenthesis suggests a nested call that stripped the CDATA
                    //wrapper, reconstructed here - confirm against upstream.
                    $str_descr = str_replace(array("<![CDATA[", "]]>"), "", $descr[0][1]);
                }

                $outputhtml .= "\n\n{$str_title}\n\n";
                $outputhtml .= "{$str_descr}\n\n";
            }
        } else {
            $outputhtml .= "Feed couldn't be parsed";
        }

        $this->data = $outputhtml;
        $charset = empty($channels[2]) ? "UTF-8" : $channels[2];
        $this->contenttype = "text/html; charset=$charset";
        $this->rewrite();

        //renderPage() escapes the url, which was inserted raw here before
        return $this->renderPage($feedTitle, $this->data);
    }

    /**
     * Tell whether a string starts with http:// or https:// (any case).
     *
     * @param mixed $url
     * @return bool
     */
    private function isHttpUrl($url)
    {
        return preg_match('#^https?://#i', (string) $url) === 1;
    }

    /**
     * Tell whether the requested host points back at this server. Plain
     * case-insensitive substring tests, so that user-supplied host names are
     * never interpreted as a regular expression.
     *
     * @param string $target  Host part of the requested url
     * @param string $ownHost Host name of this server without leading "www."
     * @return bool
     */
    private function isOwnHost($target, $ownHost)
    {
        $serverAddr = isset($_SERVER['SERVER_ADDR']) ? $_SERVER['SERVER_ADDR'] : '';
        foreach (array($ownHost, $serverAddr, "127.0.0.1", "localhost") as $needle) {
            if ($needle !== '' && stripos($target, $needle) !== false) {
                return true;
            }
        }
        return false;
    }

    /**
     * Fill the template for the current url and image setting.
     *
     * @param string $title Page title
     * @param string $body  Page body or message
     * @return string
     */
    private function renderPage($title, $body)
    {
        return sprintf(
            $this->template,
            $title,
            $this->contenttype,
            $this->baseurl,
            htmlspecialchars($this->url, ENT_QUOTES),
            $this->img ? " checked='checked'" : "",
            $this->img ? "" : " checked='checked'",
            $body
        );
    }

    /**
     * Put the object in the error state and make $data an error page.
     *
     * @param string $errortype Short description for debugging
     * @param string $title     Title of the error page
     * @param string $message   Message shown to the visitor
     */
    private function fail($errortype, $title, $message)
    {
        $this->error = true;
        $this->errortype = $errortype;
        $this->contenttype = "text/html; charset=iso-8859-1";
        $this->data = $this->renderPage($title, $message);
    }

    /**
     * Build the complete HTTP/1.0 request (request line, headers, stored
     * cookies and POST body) for the current url.
     *
     * @return string
     */
    private function buildRequest()
    {
        $host = $this->urlinfo['host'];
        $accept = isset($_SERVER['HTTP_ACCEPT']) ? $_SERVER['HTTP_ACCEPT'] : "*/*";
        $query = isset($this->urlinfo['query']) ? $this->urlinfo['query'] : "";

        $head = "{$this->method} {$this->urlinfo['path']}?{$query} HTTP/1.0\r\n"
            . "Host: {$host}\r\n"
            . "User-Agent: {$this->user_agent}\r\n"
            . "Accept: {$accept}\r\n";

        //send back the cookies this host set earlier in the session
        if (isset($_SESSION[$host]) && sizeof($_SESSION[$host]) > 0) {
            $pairs = array();
            foreach ($_SESSION[$host] as $cookieKey => $cookieVal) {
                $pairs[] = $cookieKey . "=" . urlencode($cookieVal);
            }
            $head .= 'Cookie: ' . implode("; ", $pairs) . "\r\n";
        }

        if (!empty($this->post)) {
            $length = strlen($this->post);
            $head .= "Content-Type: application/x-www-form-urlencoded\r\n";
            $head .= "Content-Length: $length\r\n";
            $head .= "\r\n";
            $head .= $this->post;
        }
        return $head . "\r\n";
    }

    /**
     * Turn a redirect target into an absolute url and request it.
     *
     * @param string $location Redirect target as sent by the server
     */
    private function followRedirect($location)
    {
        $scheme = $this->urlinfo['scheme'];
        $host = $this->urlinfo['host'];

        $this->url = preg_replace('#^\./#', '/', trim($location));
        //not a valid redirect... but we take care of it
        if (substr($this->url, 0, 1) == "/") {
            $this->url = "{$scheme}://{$host}{$this->url}";
        } else if (!$this->isHttpUrl($this->url)) {
            $this->url = "{$scheme}://{$host}/{$this->url}";
        }

        if ($this->isHttpUrl($this->url)) {
            //re-initiate vars
            $this->init();
            $this->post = "";
            //the body is dropped, so the follow-up request has to be a GET
            $this->method = "GET";
            $this->times++;
            //init() may have refused the new target (redirect to our own host)
            if (!$this->error) {
                $this->request();
            }
        }
    }

    /**
     * preg_replace_callback() adapter for rewrite().
     *
     * @param array $m Match: 1 attribute, 2 quote, 3 value, 4 terminator
     * @return string
     */
    private function replaceUrlMatch($m)
    {
        list($base, $path, $baseuri, $img) = $this->rewriteContext;
        return $this->replaceUrl($base, $path, $baseuri, $img, $m[1], $m[2], $m[3], $m[4]);
    }
}
?>