2 && $httpcode <> 3) { $status['state'] = "Unreachable: http $full_httpcode"; $linkstate = "Unreachable"; } } if ($linkstate <> "Unreachable") { while ($answer) { $answer = fgets($fp, 4096); if (ereg("Location: *([^\n\r ]+)", $answer, $regs) && $httpcode == 3 && $full_httpcode != 302) { $status['path'] = $regs[1]; $status['state'] = "Relocation: http $full_httpcode"; fclose($fp); return $status; } if (eregi("Last-Modified: *([a-z0-9,: ]+)", $answer, $regs)) { $status['date'] = $regs[1]; } if (eregi("Content-Type:", $answer)) { $content = $answer; $answer = ''; break; } } $socket_status = socket_get_status($fp); if (eregi("Content-Type: *([a-z/.-]*)", $content, $regs)) { if ($regs[1] == 'text/html' || $regs[1] == 'text/' || $regs[1] == 'text/plain') { $status['content'] = 'text'; $status['state'] = 'ok'; } else if ($regs[1] == 'application/pdf' && $index_pdf == 1) { $status['content'] = 'pdf'; $status['state'] = 'ok'; } else if (($regs[1] == 'application/msword' || $regs[1] == 'application/vnd.ms-word') && $index_doc == 1) { $status['content'] = 'doc'; $status['state'] = 'ok'; } else if (($regs[1] == 'application/excel' || $regs[1] == 'application/vnd.ms-excel') && $index_xls == 1) { $status['content'] = 'xls'; $status['state'] = 'ok'; } else if (($regs[1] == 'application/mspowerpoint' || $regs[1] == 'application/vnd.ms-powerpoint') && $index_ppt == 1) { $status['content'] = 'ppt'; $status['state'] = 'ok'; } else { $status['state'] = "Not text or html"; } } else if ($socket_status['timed_out'] == 1) { $status['state'] = "Timed out (no reply from server)"; } else $status['state'] = "Not text or html"; } } fclose($fp); return $status; }

/**
 * Read the robots.txt file of the server hosting $url and collect the
 * paths that are disallowed for this crawler.
 *
 * Only "User-agent" sections naming "*" or the global $user_agent are
 * honoured.
 *
 * @param string $url Any URL on the target host; only its host part is used.
 * @return array|null List of disallowed path strings (empty when robots.txt
 *                    is missing or has no applicable rule), or null as soon
 *                    as an applicable section contains an empty "Disallow:".
 */
function check_robot_txt($url) {
    global $user_agent;

    $urlparts = parse_url($url);
    $url = 'http://'.$urlparts['host']."/robots.txt";

    $url_status = url_status($url);
    $omit = array ();

    if ($url_status['state'] == "ok") {
        $robot = file($url);
        if (!$robot) {
            // file() gave nothing usable; fall back to the project's own fetcher.
            $contents = getFileContents($url);
            $file = $contents['file'];
            $robot = explode("\n", $file);
        }

        $regs = Array ();
        $this_agent = "";
        // Rules that appear before any User-agent line apply to nobody.
        $check = 0;

        foreach ($robot as $line) {
            if (preg_match("/^user-agent: *([^#]+) */i", $line, $regs)) {
                $this_agent = trim($regs[1]);
                if ($this_agent == '*' || $this_agent == $user_agent) {
                    $check = 1;
                } else {
                    $check = 0;
                }
            }

            if ($check == 1 && preg_match("/disallow: *([^#]+)/i", $line, $regs)) {
                // Strip every kind of whitespace, including the "\r" left
                // behind by CRLF line endings, so stored paths compare
                // cleanly against URL paths.
                $disallow_str = preg_replace("/\s+/", "", $regs[1]);
                if ($disallow_str != "") {
                    $omit[] = $disallow_str;
                } else {
                    // Empty Disallow in an applicable section: stop and
                    // report null, as callers of this function expect.
                    return null;
                }
            }
        }
    }

    return $omit;
}

/**
 * Remove the file part from an url (to build an url from an url and a
 * given relative path).
 *
 * Everything after the last "/" of the path is dropped; query string and
 * fragment are not carried over. Port 80 is omitted from the result.
 *
 * @param string $url Absolute URL.
 * @return string scheme://host[:port]/directory/ form of $url.
 */
function remove_file_from_url($url) {
    $url_parts = parse_url($url);
    $path = isset($url_parts['path']) ? $url_parts['path'] : '';

    // Drop the trailing non-slash segment directly. The previous code
    // interpolated the file name into a pattern unquoted, which failed for
    // names containing regex metacharacters such as "+" or "(".
    $path = preg_replace('/[^\/]+$/', '', $path);

    if (isset($url_parts['port']) && $url_parts['port'] != "" && $url_parts['port'] != 80) {
        $portq = ":".$url_parts['port'];
    } else {
        $portq = "";
    }

    $url = $url_parts['scheme']."://".$url_parts['host'].$portq.$path;
    return $url;
}

/**
 * Extract links from html.
 *
 * Looks at href attributes, frame/iframe src attributes, window.location
 * assignments, meta refresh targets and window.open() calls. Every
 * candidate is passed through url_purify(); only non-empty results are
 * returned. A given raw candidate string is purified at most once.
 *
 * @param string $file             HTML source of the page.
 * @param string $url              URL of the page, used to resolve relative links.
 * @param int    $can_leave_domain Passed through to url_purify().
 * @param string $base             Base URL from the page head; replaces $url when non-empty.
 * @return array List of purified URLs, in order of discovery.
 */
function get_links($file, $url, $can_leave_domain, $base) {
    // The base URL comes from either the meta tag or the current URL.
    if (!empty($base)) {
        $url = $base;
    }

    $links = array ();
    $regs = Array ();
    $checked_urls = Array();

    // Each entry: pattern, index of the capture group holding the URL, and
    // whether rel=nofollow (capture group 4 of the href pattern) applies.
    // In the non-href patterns group 1 is only the matched prefix
    // (e.g. "frame ... src") and group 2 is the URL.
    $patterns = array (
        array ("/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i", 1, true),
        array ("/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2, false),
        array ("/(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2, false),
        array ("/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2, false),
        array ("/(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2, false)
    );

    foreach ($patterns as $spec) {
        $regex = $spec[0];
        $url_group = $spec[1];
        $honour_nofollow = $spec[2];

        preg_match_all($regex, $file, $regs, PREG_SET_ORDER);
        foreach ($regs as $val) {
            $candidate = $val[$url_group];

            if (isset($checked_urls[$candidate])) {
                continue;
            }
            // With PREG_SET_ORDER trailing unmatched groups are dropped, so
            // group 4 of the href pattern exists only when rel=nofollow was
            // present. Such a link is skipped without being marked checked.
            if ($honour_nofollow && isset($val[4])) {
                continue;
            }

            if (($a = url_purify($candidate, $url, $can_leave_domain)) != '') {
                $links[] = $a;
            }
            $checked_urls[$candidate] = 1;
        }
    }

    return $links;
}

/**
 * Build a unique word array from the text of a webpage, together with the
 * count of each word.
 *
 * Words are optionally stemmed (global $stem_words), sorted, and runs of
 * equal words are collapsed. A word is kept only if it is at least
 * $min_word_length long, matches the letter (or letter/digit, when
 * $index_numbers is 1) pattern after remove_accents(), and is not flagged
 * in the global $common list. Counts are capped at $word_upper_bound.
 *
 * @param array $arr List of words.
 * @return array List of entries array(1 => word, 2 => count).
 */
function unique_array($arr) {
    global $min_word_length;
    global $common;
    global $word_upper_bound;
    global $index_numbers, $stem_words;

    if ($stem_words == 1) {
        $newarr = Array();
        foreach ($arr as $val) {
            $newarr[] = stem($val);
        }
        $arr = $newarr;
    }

    sort($arr);
    reset($arr);
    $newarr = array ();

    $i = 0;
    $counter = 1;
    $element = current($arr);

    if ($index_numbers == 1) {
        $pattern = "/[a-z0-9]+/";
    } else {
        $pattern = "/[a-z]+/";
    }

    $regs = Array ();
    $total = sizeof($arr);
    for ($n = 0; $n < $total; $n ++) {
        //check if word is long enough, contains alphabetic characters and is not a common word
        //to eliminate/count multiple instance of words
        $next_in_arr = next($arr);
        if ($next_in_arr != $element) {
            if (strlen($element) >= $min_word_length && preg_match($pattern, remove_accents($element)) && (@ $common[$element] <> 1)) {
                // Trim a leading / trailing dash or escaped quote.
                if (preg_match("/^(-|\\\')(.*)/", $element, $regs))
                    $element = $regs[2];

                if (preg_match("/(.*)(\\\'|-)$/", $element, $regs))
                    $element = $regs[1];

                $newarr[$i][1] = $element;
                $newarr[$i][2] = $counter;
                $element = current($arr);
                $i ++;
                $counter = 1;
            } else {
                $element = $next_in_arr;
                // Start counting afresh: occurrences of the rejected word
                // must not be credited to the word that follows it.
                $counter = 1;
            }
        } else {
            if ($counter < $word_upper_bound)
                $counter ++;
        }
    }

    return $newarr;
}

/**
 * Checks if url is legal, relative to the main url.
 *
 * Rejects links to other hosts (unless $can_leave_domain is 1), links
 * ending in an excluded extension from the global $ext, links ending in a
 * backslash, queries flagged in the global $apache_indexes, mailto: /
 * javascript: / news: links and any scheme other than http or https.
 * Relative links are resolved against $parent_url and "../", "./" and
 * duplicate slashes are collapsed.
 *
 * Side effect: when $can_leave_domain is not 1 the global $mainurl is
 * overwritten with remove_file_from_url($mainurl).
 *
 * @param string $url              Link as found in the page.
 * @param string $parent_url       URL of the page containing the link.
 * @param int    $can_leave_domain 1 to accept links outside the start URL.
 * @return string Absolute URL, or '' when the link must not be followed.
 */
function url_purify($url, $parent_url, $can_leave_domain) {
    global $ext, $mainurl, $apache_indexes, $strip_sessids;

    $urlparts = parse_url($url);
    $main_url_parts = parse_url($mainurl);

    $link_host = isset($urlparts['host']) ? $urlparts['host'] : "";
    $main_host = isset($main_url_parts['host']) ? $main_url_parts['host'] : "";
    if ($link_host != "" && $link_host != $main_host && $can_leave_domain != 1) {
        return '';
    }

    foreach ($ext as $excl) {
        if (preg_match("/\.$excl$/i", $url)) {
            return '';
        }
    }

    if (substr($url, -1) == '\\') {
        return '';
    }

    if (isset($urlparts['query']) && !empty($apache_indexes[$urlparts['query']])) {
        return '';
    }

    if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
        return '';
    }

    if (isset($urlparts['scheme'])) {
        $scheme = $urlparts['scheme'];
    } else {
        $scheme ="";
    }

    //only http and https links are followed
    if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
        return '';
    }

    //parent url might be used to build an url from relative path
    $parent_url = remove_file_from_url($parent_url);
    $parent_url_parts = parse_url($parent_url);

    if (substr($url, 0, 1) == '/') {
        // Root-relative link: keep the parent's non-default port, otherwise
        // links on a site served from e.g. :8080 would lose it.
        if (isset($parent_url_parts['port']) && $parent_url_parts['port'] != "" && $parent_url_parts['port'] != 80) {
            $parent_portq = ":".$parent_url_parts['port'];
        } else {
            $parent_portq = "";
        }
        $url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$parent_portq.$url;
    } else if (!isset($urlparts['scheme'])) {
        $url = $parent_url.$url;
    }

    $url_parts = parse_url($url);
    $urlpath = isset($url_parts['path']) ? $url_parts['path'] : "";

    $regs = Array ();
    while (preg_match("/[^\/]*\/[.]{2}\//", $urlpath, $regs)) {
        $urlpath = str_replace($regs[0], "", $urlpath);
    }

    //remove relative path instructions like ../ etc
    $urlpath = preg_replace("/\/+/", "/", $urlpath);
    $urlpath = preg_replace("/[^\/]*\/[.]{2}/", "", $urlpath);
    $urlpath = str_replace("./", "", $urlpath);

    $query = "";
    if (isset($url_parts['query'])) {
        $query = "?".$url_parts['query'];
    }

    // Use the link's own port. The earlier mix of the start URL's port and
    // the link's port could emit "host:" with nothing after the colon, or
    // silently replace the link's port with the start URL's.
    if (isset($url_parts['port']) && $url_parts['port'] != "" && $url_parts['port'] != 80) {
        $portq = ":".$url_parts['port'];
    } else {
        $portq = "";
    }

    $url = $url_parts['scheme']."://".$url_parts['host'].$portq.$urlpath.$query;

    //if we index sub-domains
    if ($can_leave_domain == 1) {
        return $url;
    }

    // Note: this assigns to the global $mainurl.
    $mainurl = remove_file_from_url($mainurl);

    if ($strip_sessids == 1) {
        $url = remove_sessid($url);
    }

    //only urls in staying in the starting domain/directory are followed
    $url = convert_url($url);
    if (strstr($url, $mainurl) == false) {
        return '';
    } else
        return $url;
}

/**
 * Store the keywords of one indexed page.
 *
 * Each word is looked up in the global $all_keywords cache, inserted into
 * the keywords table when unknown, and linked to $link_id in one of the
 * sixteen link_keyword0 .. link_keywordf tables, selected by the first hex
 * digit of md5(word). Words longer than 30 characters are ignored.
 * MySQL error text is echoed after each statement.
 *
 * NOTE(review): $word and $domain are interpolated into SQL without
 * escaping here; presumably the caller has already escaped them (confirm
 * before adding escaping, which could otherwise double-escape).
 *
 * @param array $wordarray Entries array(1 => word, 2 => weight), as built by unique_array().
 * @param mixed $link_id   Value written to the link_id column.
 * @param mixed $domain    Value written to the domain column.
 * @return void
 */
function save_keywords($wordarray, $link_id, $domain) {
    global $mysql_table_prefix, $all_keywords;

    // One accumulated VALUES list per target table suffix.
    $inserts = array ();

    foreach ($wordarray as $entry) {
        $word = $entry[1];
        $wordmd5 = substr(md5($word), 0, 1);
        $weight = $entry[2];

        if (strlen($word)<= 30) {
            $keyword_id = isset($all_keywords[$word]) ? $all_keywords[$word] : "";
            if ($keyword_id == "") {
                mysql_query("insert into ".$mysql_table_prefix."keywords (keyword) values ('$word')");
                if (mysql_errno() == 1062) {
                    // Duplicate key: the keyword already exists, fetch its id.
                    $result = mysql_query("select keyword_ID from ".$mysql_table_prefix."keywords where keyword='$word'");
                    echo mysql_error();
                    $row = mysql_fetch_row($result);
                    $keyword_id = $row[0];
                } else {
                    $keyword_id = mysql_insert_id();
                    $all_keywords[$word] = $keyword_id;
                    echo mysql_error();
                }
            }

            if (!isset($inserts[$wordmd5])) {
                $inserts[$wordmd5] = "";
            }
            $inserts[$wordmd5] .= ",($link_id, $keyword_id, $weight, $domain)";
        }
    }

    for ($i=0;$i<=15; $i++) {
        $char = dechex($i);
        // Drop the leading comma of the accumulated list.
        $values = isset($inserts[$char]) ? substr($inserts[$char], 1) : "";

        if ($values!="") {
            $query = "insert into ".$mysql_table_prefix."link_keyword$char (link_id, keyword_id, weight, domain) values $values";
            mysql_query($query);
            echo mysql_error();
        }
    }
}

function get_head_data($file) { $headdata = ""; preg_match("@
]*>(.*?)<\/head>@si",$file, $regs); $headdata = $regs[1]; $description = ""; $robots = ""; $keywords = ""; $base = ""; $res = Array (); if ($headdata != "") { preg_match("/'\"]+)[\"']?/i", $headdata, $res); if (isset ($res)) { $robots = $res[1]; } preg_match("/'\"]+)[\"']?/i", $headdata, $res); if (isset ($res)) { $description = $res[1]; } preg_match("/'\"]+)[\"']?/i", $headdata, $res); if (isset ($res)) { $keywords = $res[1]; } // e.g.