2 && $httpcode <> 3) {
$status['state'] = "Unreachable: http $full_httpcode";
$linkstate = "Unreachable";
$realnum -- ;
}
}
if ($linkstate <> "Unreachable") {
while ($answer) {
$answer = fgets($fp, 4096);
if (ereg("Location: *([^\n\r ]+)", $answer, $regs) && $httpcode == 3 && $full_httpcode != 302) {
$status['path'] = $regs[1];
$status['state'] = "Relocation: http $full_httpcode";
fclose($fp);
return $status;
}
if (eregi("Last-Modified: *([a-z0-9,: ]+)", $answer, $regs)) {
$status['date'] = $regs[1];
}
if (eregi("Content-Type:", $answer)) {
$content = $answer;
$answer = '';
break;
}
}
$socket_status = socket_get_status($fp);
if (eregi("Content-Type: *([a-z/.-]*)", $content, $regs)) {
if ($regs[1] == 'text/html' || $regs[1] == 'text/' || $regs[1] == 'text/plain') {
$status['content'] = 'text';
$status['state'] = 'ok';
} else if ($regs[1] == 'application/pdf' && $index_pdf == 1) {
$status['content'] = 'pdf';
$status['state'] = 'ok';
} else if ($regs[1] == 'application/pdf' && $index_pdf == 0) {
$status['content'] = 'pdf';
$status['state'] = 'Indexing of PDF files is not activated in Admin Settings';
} else if (($regs[1] == 'application/msword' || $regs[1] == 'application/vnd.ms-word') && $index_doc == 1) {
$status['content'] = 'doc';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/msword' || $regs[1] == 'application/vnd.ms-word') && $index_doc == 0) {
$status['content'] = 'doc';
$status['state'] = 'Indexing of DOC files is not activated in Admin Settings';
} else if (($regs[1] == 'text/rtf') && $index_rtf == 1) {
$status['content'] = 'rtf';
$status['state'] = 'ok';
} else if (($regs[1] == 'text/rtf') && $index_rtf == 0) {
$status['content'] = 'rtf';
$status['state'] = 'Indexing of RTF files is not activated in Admin Settings';
} else if (($regs[1] == 'application/excel' || $regs[1] == 'application/vnd.ms-excel') && $index_xls == 1) {
$status['content'] = 'xls';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/excel' || $regs[1] == 'application/vnd.ms-excel') && $index_xls == 0) {
$status['content'] = 'xls';
$status['state'] = 'Indexing of XLS files is not activated in Admin Settings';
} else if (($regs[1] == 'application/mspowerpoint' || $regs[1] == 'application/vnd.ms-powerpoint') && $index_ppt == 1) {
$status['content'] = 'ppt';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/mspowerpoint' || $regs[1] == 'application/vnd.ms-powerpoint') && $index_ppt == 0) {
$status['content'] = 'ppt';
$status['state'] = 'Indexing of PPT files is not activated in Admin Settings';
} else {
$status['state'] = "Not text or html";
$realnum -- ;
}
} else
if ($socket_status['timed_out'] == 1) {
$status['state'] = "Timed out (no reply from server)";
$realnum -- ;
} else
$status['state'] = "Not text or html";
}
}
fclose($fp);
unset ($urlparts, $answer);
return $status;
}
// Read a site's robots file and collect the paths this crawler must not visit.
//
// $url    - any URL on the site; only its host part is used.
// $robots - file name of the robots file, appended to "http://<host>/".
//
// Returns:
//   - an array of disallowed path strings (possibly empty, also when the
//     robots file could not be fetched with state "ok"),
//   - null as soon as an empty "Disallow:" is found for a matching agent
//     (an empty Disallow means nothing is forbidden for that agent).
//
// Only records for user agent '*' or the global $user_agent are honoured.
function check_robot_txt($url, $robots) {
	global $user_agent;

	$urlparts = parse_url($url);
	$url = 'http://'.$urlparts['host']."/$robots";
	$url_status = url_status($url);
	$omit = array ();

	if ($url_status['state'] != "ok") {
		return $omit;
	}

	$robot = file($url);
	if (!$robot) {
		// file() failed or returned nothing; fall back to the project's own fetcher.
		$contents = getFileContents($url);
		$file = $contents['file'];
		$robot = explode("\n", $file);
	}

	$regs = array ();
	$this_agent = "";
	// Must start at 0: a Disallow line that precedes any User-agent line
	// does not belong to a record and must not be applied.
	$check = 0;

	foreach ($robot as $line) {
		if (preg_match('/^user-agent: *([^#]+) */i', $line, $regs)) {
			$this_agent = trim($regs[1]);
			$check = ($this_agent == '*' || $this_agent == $user_agent) ? 1 : 0;
		}

		if ($check == 1 && preg_match('/disallow: *([^#]+)/i', $line, $regs)) {
			// Drop blanks and line terminators (CR included, for CRLF files)
			// so the stored path can be compared against URLs.
			$disallow_str = preg_replace('/[\r\n ]+/', '', $regs[1]);
			if (trim($disallow_str) != "") {
				$omit[] = $disallow_str;
			} else {
				// $check == 1 already implies the agent matches.
				return null;
			}
		}
	}

	return $omit;
}
// Remove the file part from an url (to build an url from an url and given relative path)
//
// $url - absolute URL, e.g. "http://host:8080/dir/page.html?x=1".
//
// Returns scheme://host[:port]/dir/ : the last path segment (everything after
// the final "/") is cut off, and the query string and fragment are dropped
// because the result is rebuilt from scheme, host, port and path only.
// Port 80 and a missing port are both rendered without a ":port" suffix.
function remove_file_from_url($url) {
	$url_parts = parse_url($url);
	if (!is_array($url_parts)) {
		// parse_url() returns false for seriously malformed URLs; treat every
		// component as empty, which yields the same "://" as before.
		$url_parts = array ();
	}

	$scheme = isset($url_parts['scheme']) ? $url_parts['scheme'] : '';
	$host   = isset($url_parts['host'])   ? $url_parts['host']   : '';
	$port   = isset($url_parts['port'])   ? $url_parts['port']   : '';
	$path   = isset($url_parts['path'])   ? $url_parts['path']   : '';

	// Strip the trailing segment directly. The file name must not be
	// interpolated into a pattern: names such as "a+b.html" or "x(1).html"
	// contain regex metacharacters and would not match themselves.
	$path = preg_replace('/[^\/]+$/', '', $path);

	if ($port == 80 || $port == "") {
		$portq = "";
	} else {
		$portq = ":".$port;
	}

	return $scheme."://".$host.$portq.$path;
}
// Extract links from html
//
// $file             - HTML source of the page.
// $url              - URL of the page, used to resolve relative links.
// $can_leave_domain - passed through to url_purify().
// $base             - base URL taken from the page; when non-empty it replaces $url.
//
// Returns an array of the non-empty results of url_purify() for every link
// candidate found. Candidates come from href attributes (skipped when the
// link carries rel=nofollow), frame src attributes, window.location
// assignments, meta refresh targets and window.open() calls. Each distinct
// raw candidate string is passed to url_purify() at most once.
function get_links($file, $url, $can_leave_domain, $base) {
	// The base URL comes from either the meta tag or the current URL.
	if (!empty($base)) {
		$url = $base;
	}

	$links = array ();
	$checked_urls = array ();

	// Blank out HTML comments so links inside them are not followed.
	// NOTE(review): the pattern in the file was the empty regex "@@si", which
	// inserts a blank between every character and so breaks all matching
	// below. It looks like a comment-stripping pattern whose markup was lost;
	// confirm against the upstream source.
	$file = preg_replace("@<!--.*?-->@s", " ", $file);

	// Each entry: pattern, index of the capture group holding the URL, and
	// index of the capture group that is only present for rel=nofollow
	// (0 = the pattern has no such group).
	$extractors = array (
		array ("/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i", 1, 4),
		array ("/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2, 0),
		array ("/(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2, 0),
		array ("/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2, 0),
		array ("/(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", 2, 0),
	);

	foreach ($extractors as $extractor) {
		$pattern = $extractor[0];
		$url_group = $extractor[1];
		$nofollow_group = $extractor[2];

		$regs = array ();
		preg_match_all($pattern, $file, $regs, PREG_SET_ORDER);

		foreach ($regs as $val) {
			$candidate = $val[$url_group];

			if ($nofollow_group == 0) {
				// In these patterns group 1 is the attribute/JS prefix and
				// group 2 is the URL. The former code passed group 1 on and
				// additionally required group 4 to be absent, which never
				// holds here, so no such link was ever returned.
				// The URL character class allows blanks, hence the trim.
				$candidate = trim($candidate);
				if ($candidate == '') {
					continue;
				}
			} else if (isset ($val[$nofollow_group])) {
				// rel=nofollow: skip without marking the URL as checked, so a
				// later followable link to the same target is still used.
				continue;
			}

			if (isset ($checked_urls[$candidate])) {
				continue;
			}
			$checked_urls[$candidate] = 1;

			if (($a = url_purify($candidate, $url, $can_leave_domain)) != '') {
				$links[] = $a;
			}
		}
	}

	return $links;
}
// Function to build a unique word array from the text of a webpage, together with the count of each word
//
// $arr - list of words (strings).
//
// Returns a list of entries, each an array with index 1 = word and
// index 2 = number of occurrences, capped at $word_upper_bound. Words are
// processed in sort() order. A word is kept only if it is at least
// $min_word_length long, is not marked 1 in $common, and passes the
// character filter selected by $utf8 / $index_numbers. One leading and one
// trailing "-" or "\'" is stripped from a kept word before it is stored.
//
// Reads globals: $min_word_length, $common, $word_upper_bound,
// $index_numbers, $stem_words, $utf8, $case_sensitive.
function unique_array($arr) {
	global $min_word_length, $common, $word_upper_bound;
	global $index_numbers, $stem_words, $utf8, $case_sensitive;

	if ($stem_words == 1) {
		$stemmed = array ();
		foreach ($arr as $val) {
			$stemmed[] = stem($val);
		}
		$arr = $stemmed;
	}

	sort($arr);
	reset($arr);

	$fold_case = ($case_sensitive == '0');
	$utf8_mode = ($utf8 == '1');

	if ($utf8_mode) {
		// With utf8 support a word is rejected when it matches this pattern.
		$reject_pattern = ($index_numbers == 0) ? "/[0-9]+/" : "/[ ]+/";
	} else {
		// Without utf8 support a word must match both patterns.
		$require_pattern = ($index_numbers == 1) ? "/[a-z0-9]+/" : "/[a-z]+/";
		$alnum_pattern = "/[a-z0-9]+/"; // kill all non-alphanumerical characters
	}

	$newarr = array ();
	$i = 0;
	$counter = 1;
	$element = $fold_case ? lower_case(current($arr)) : current($arr);
	$regs = array ();
	$total = sizeof($arr);

	// $element is the word currently being counted; the loop looks one word
	// ahead and flushes $element whenever the look-ahead differs from it.
	// NOTE(review): in case-insensitive mode only $element is lower-cased,
	// never the look-ahead word, so this presumably relies on the caller
	// having lower-cased the text already - confirm.
	for ($n = 0; $n < $total; $n ++) {
		$next_in_arr = next($arr);
		// next() yields false once the array is exhausted. That must still
		// flush the word being counted; previously the length test below
		// swallowed this step and the last word was never returned.
		$at_end = ($next_in_arr === false);

		if (!$at_end && strlen($next_in_arr) < $min_word_length) {
			continue;
		}

		if (!$at_end && $next_in_arr == $element) {
			if ($counter < $word_upper_bound) {
				$counter ++;
			}
			continue;
		}

		// check if word is long enough, passes the character filter and is not a common word
		$keep = (strlen($element) >= $min_word_length);
		if ($keep) {
			if ($utf8_mode) {
				$keep = !preg_match($reject_pattern, $element);
			} else {
				$keep = preg_match($require_pattern, remove_accents($element)) && preg_match($alnum_pattern, $element);
			}
		}
		if ($keep && isset ($common[$element]) && $common[$element] == 1) {
			$keep = false;
		}

		if ($keep) {
			if (preg_match("/^(-|\\\')(.*)/", $element, $regs)) {
				$element = $regs[2];
			}
			if (preg_match("/(.*)(\\\'|-)$/", $element, $regs)) {
				$element = $regs[1];
			}
			$newarr[$i][1] = $element;
			$newarr[$i][2] = $counter;
			$i ++;
			if ($at_end) {
				break;
			}
			$element = $fold_case ? lower_case(current($arr)) : current($arr);
		} else {
			$element = $next_in_arr;
		}
		// A new word starts here in both branches. Previously the count of a
		// filtered-out word leaked into the word that followed it.
		$counter = 1;
	}

	return $newarr;
}

// Check if url is legal, relative to the main url.
//
// $url              - link candidate as found in a page (absolute or relative).
// $parent_url       - URL of the page the link was found on.
// $can_leave_domain - 1 allows links to hosts other than that of $mainurl.
//
// Returns the normalised absolute URL, or '' when the link must not be
// followed (foreign host, excluded extension, trailing backslash, query
// listed in $apache_indexes, mailto/javascript/news, scheme other than
// http/https, or - unless $can_leave_domain is 1 - outside $mainurl).
//
// NOTE(review): when $can_leave_domain is not 1 this function assigns to the
// global $mainurl (file part removed). Kept as is because other code may rely
// on it - confirm.
function url_purify($url, $parent_url, $can_leave_domain) {
	global $ext, $mainurl, $apache_indexes, $strip_sessids;

	// Must be taken before $parent_url is reduced to its directory below.
	// The call had no argument at all, so the "/?" branch further down never
	// had a parent path to work with.
	$original_parent_url_parts = parse_url($parent_url);
	$urlparts = parse_url($url);
	$main_url_parts = parse_url($mainurl);

	$url_host = isset ($urlparts['host']) ? $urlparts['host'] : '';
	$main_host = isset ($main_url_parts['host']) ? $main_url_parts['host'] : '';
	if ($url_host != "" && $url_host != $main_host && $can_leave_domain != 1) {
		return '';
	}

	if (is_array($ext)) {
		foreach ($ext as $excl) {
			if (preg_match("/\.$excl$/i", $url)) {
				return '';
			}
		}
	}

	if (substr($url, -1) == '\\') {
		return '';
	}

	if (isset ($urlparts['query']) && !empty ($apache_indexes[$urlparts['query']])) {
		return '';
	}

	if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
		return '';
	}

	$scheme = isset ($urlparts['scheme']) ? $urlparts['scheme'] : "";
	//only http and https links are followed
	if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
		return '';
	}

	//parent url might be used to build an url from relative path
	$parent_url = remove_file_from_url($parent_url);
	$parent_url_parts = parse_url($parent_url);

	if (substr($url, 0, 1) == '/') {
		$parent_scheme = isset ($parent_url_parts['scheme']) ? $parent_url_parts['scheme'] : '';
		$parent_host = isset ($parent_url_parts['host']) ? $parent_url_parts['host'] : '';
		$url = $parent_scheme."://".$parent_host.$url;
	} else if (!isset ($urlparts['scheme'])) {
		$url = $parent_url.$url;
	}

	$url_parts = parse_url($url);
	$urlpath = isset ($url_parts['path']) ? $url_parts['path'] : '';

	$regs = array ();
	while (preg_match("/[^\/]*\/[.]{2}\//", $urlpath, $regs)) {
		$urlpath = str_replace($regs[0], "", $urlpath);
	}

	//remove relative path instructions like ../ etc
	$urlpath = preg_replace("/\/+/", "/", $urlpath);
	$urlpath = preg_replace("/[^\/]*\/[.]{2}/", "", $urlpath);
	$urlpath = str_replace("./", "", $urlpath);

	$query = "";
	if (isset ($url_parts['query'])) {
		$query = "?".$url_parts['query'];
	}

	// NOTE(review): the test mixes the port of $mainurl with the port of the
	// link and always emits the main URL's port - looks odd, confirm intent.
	$main_port = isset ($main_url_parts['port']) ? $main_url_parts['port'] : '';
	$url_port = isset ($url_parts['port']) ? $url_parts['port'] : '';
	if ($main_port == 80 || $url_port == "") {
		$portq = "";
	} else {
		$portq = ":".$main_port;
	}

	if (!$urlpath) {
		$urlpath = "/"; // if not exists, add slash instead of real urlpath
	}

	$final_scheme = isset ($url_parts['scheme']) ? $url_parts['scheme'] : '';
	$final_host = isset ($url_parts['host']) ? $url_parts['host'] : '';
	$url = $final_scheme."://".$final_host.$portq.$urlpath.$query;

	// The URL ended up as ".../?query" - presumably from a link consisting of
	// a query string only. Rebuild it from $mainurl, the parent page's path
	// (with the main URL's path removed) and the query.
	if (strstr($url, "/?")) {
		$main_path = isset ($main_url_parts['path']) ? $main_url_parts['path'] : '';
		$parent_path = isset ($original_parent_url_parts['path']) ? $original_parent_url_parts['path'] : '';
		$page = str_replace($main_path, '', $parent_path);
		if (substr(trim($mainurl), -1) !== "/" and substr(trim($page), 0, 1) !== "/") {
			$page = "/" . $page;
		}
		$url = $mainurl . $page . $query;
	}

	// if we index sub-domains
	if ($can_leave_domain == 1) {
		return $url;
	}

	$mainurl = remove_file_from_url($mainurl);

	if ($strip_sessids == 1) {
		$url = remove_sessid($url);
	}

	// only urls in staying in the starting domain/directory are followed
	$url = convert_url($url);
	if (strstr($url, $mainurl) == false) {
		return '';
	}
	return $url;
}

// Store the keywords of one page and link them to the page.
//
// $wordarray - list of entries; each entry is an array with index 1 = word,
//              2 = hit count, 3 = weight.
// $link_id   - id of the page the words belong to.
// $domain    - domain id stored with every link/keyword row.
//
// Words longer than 255 characters are skipped. Unknown words are inserted
// into <prefix>keywords; known ids are taken from the global $all_keywords
// cache. The link rows go into one of the sixteen tables
// <prefix>link_keyword0 .. <prefix>link_keywordf, chosen by the first hex
// digit of md5(word), with one multi-row INSERT per table.
//
// NOTE(review): $word, $link_id, $domain, $weight and $hits are placed into
// the SQL text unescaped. The "\'" handling in unique_array() suggests words
// arrive already escaped, so no escaping is added here to avoid escaping
// twice - confirm at the call site.
function save_keywords($wordarray, $link_id, $domain) {
	global $mysql_table_prefix, $all_keywords;

	$inserts = array ();

	foreach ($wordarray as $entry) {
		$word = $entry[1];
		$hits = $entry[2];
		$weight = $entry[3];

		if (strlen($word) > 255) {
			continue;
		}

		$wordmd5 = substr(md5($word), 0, 1);
		$keyword_id = isset ($all_keywords[$word]) ? $all_keywords[$word] : "";

		if ($keyword_id == "") {
			mysql_query("insert into ".$mysql_table_prefix."keywords (keyword) values ('$word')");
			if (mysql_errno() == 1062) {
				// Duplicate key: the word exists already, fetch its id.
				$result = mysql_query("select keyword_ID from ".$mysql_table_prefix."keywords where keyword='$word'");
				echo mysql_error();
				$row = mysql_fetch_row($result);
				$keyword_id = $row[0];
				clean_resource($result);
			} else {
				$keyword_id = mysql_insert_id();
				echo mysql_error();
			}
			// Cache the id in both cases so the same word does not trigger
			// another failing insert later on.
			$all_keywords[$word] = $keyword_id;
		}

		if (!isset ($inserts[$wordmd5])) {
			$inserts[$wordmd5] = "";
		}
		$inserts[$wordmd5] .= ",($link_id, $keyword_id, $weight, $domain, $hits)";
	}

	for ($i = 0; $i <= 15; $i ++) {
		$char = dechex($i);
		if (!isset ($inserts[$char])) {
			continue;
		}
		// Drop the leading comma of the collected value list.
		$values = substr($inserts[$char], 1);
		if ($values != "") {
			$query = "insert into ".$mysql_table_prefix."link_keyword$char (link_id, keyword_id, weight, domain, hits) values $values";
			mysql_query($query);
			echo mysql_error();
		}
	}
}

function get_head_data($file) { $headdata = ""; preg_match("@]*>(.*?)<\/head>@si",$file, $regs); $headdata = $regs[1]; $description = ""; $robots = ""; $keywords = ""; $base = ""; $res = Array (); if ($headdata != "") { preg_match("/'\"]+)[\"']?/i", $headdata, $res); if (isset ($res)) { $robots = $res[1]; } preg_match("/\"]+)[\"']?/i", $headdata, $res); if (isset ($res)) { $description = $res[1]; } preg_match("/\"]+)[\"']?/i", $headdata, $res); if (isset ($res)) { $keywords = $res[1]; } // e.g.