\n";
include "config.php";

# NOTE(review): this section was reconstructed from a whitespace-collapsed copy
# in which HTML-like text inside string literals had been stripped. The
# "<base ... href" and "<title>" patterns below were restored by analogy with
# the patterns in get_link() -- confirm against the pristine file.

#DEFINE CONSTANTS
$cfn = 0;          # number of pages passed to index_file() so far
$cwn = 0;          # number of distinct words written to SITEWORDS
$kbcount = 0;      # running total of page sizes, in whole kilobytes
$words = array();  # word => packed 4-byte FINFO offsets of the pages containing it

$fp_FINFO = fopen("$FINFO", "w");
$fp_SITEWORDS = fopen("$SITEWORDS", "wb");
$fp_WORD_IND = fopen("$WORD_IND", "wb");
if ($fp_FINFO === FALSE || $fp_SITEWORDS === FALSE || $fp_WORD_IND === FALSE) {
    print "Error in opening index files for writing\n";
    exit(1);
}
# One leading byte, presumably so that no page record starts at offset 0
# -- TODO confirm against the search script.
fwrite($fp_FINFO, "\n");

$time1 = getmicrotime();
start_spidering();
$time2 = getmicrotime();
$time = $time2 - $time1;
fclose($fp_FINFO);
print "\nScan took $time sec.\n";

# Dump the word list (SITEWORDS) and, per word, the list of page offsets
# (WORD_IND). Output is buffered and flushed in chunks of roughly 32000 bytes.
# While writing, each $words[$word] is replaced by pack("NN", <offset of the
# word in SITEWORDS>, <offset of its page list in WORD_IND>), which is what
# build_hash() stores in the hash buckets afterwards.
print "Writing SITEWORDS\n";
$pos_sitewords = ftell($fp_SITEWORDS);
$pos_word_ind = ftell($fp_WORD_IND);
$to_print_sitewords = "";
$to_print_word_ind = "";
foreach ($words as $word => $value) {
    $cwn++;
    $words_word_dum = pack(
        "NN",
        $pos_sitewords + strlen($to_print_sitewords),
        $pos_word_ind + strlen($to_print_word_ind)
    );
    $to_print_sitewords .= "$word\x0A";
    # Page list = entry count followed by the 4-byte offsets themselves.
    $to_print_word_ind .= pack("N", strlen($value) / 4) . $value;
    $words[$word] = $words_word_dum;
    if (strlen($to_print_word_ind) > 32000) {
        fwrite($fp_SITEWORDS, $to_print_sitewords);
        fwrite($fp_WORD_IND, $to_print_word_ind);
        $to_print_sitewords = "";
        $to_print_word_ind = "";
        $pos_sitewords = ftell($fp_SITEWORDS);
        $pos_word_ind = ftell($fp_WORD_IND);
    }
}
fwrite($fp_SITEWORDS, $to_print_sitewords);
fwrite($fp_WORD_IND, $to_print_word_ind);
fclose($fp_SITEWORDS);
fclose($fp_WORD_IND);

print "Build hash\n";
build_hash();

print "$cfn files are indexed\n";

#=====================================================================
/**
 * Crawl every URL reachable from $start_url and index each page.
 *
 * Keeps a queue of URLs to visit and a set of URLs already visited. For each
 * page it extracts links with get_link(), resolves them with
 * get_absolute_url() (honouring a <base href> if present), filters them with
 * check_url() and passes the page text to index_file().
 *
 * Pages that cannot be opened are reported and skipped; previously the text
 * of the preceding page (or an undefined variable) was indexed under the
 * failing URL.
 */
function start_spidering()
{
    global $start_url;

    $to_visit = array();
    foreach ($start_url as $v) {
        $to_visit[$v] = 1;
    }
    $visited = array();

    while (count($to_visit) > 0) {
        # Take the first queued URL. each() was removed in PHP 8 and depended
        # on the array's internal pointer, which is unreliable while entries
        # are being appended and removed.
        reset($to_visit);
        $url = key($to_visit);
        unset($to_visit[$url]);
        $visited[$url] = 1;

        print "Url: $url\n";

        $fp = fopen($url, "r");
        if ($fp == FALSE) {
            print "Error in opening file: $url\n";
            continue;
        }
        $text = "";
        while (!feof($fp)) {
            $chunk = fgets($fp, 4096);
            if ($chunk === FALSE) {
                # Read error or EOF not yet flagged: stop instead of spinning.
                break;
            }
            $text .= $chunk;
        }
        fclose($fp);

        print "$url - " . strlen($text) . " bytes\n";

        # A <base href="..."> element overrides the page URL when resolving
        # relative links.
        $base = $url;
        if (preg_match_all("/<base[^>]+href=([\"']?)([^\\s\"'>]+)\\1/is", $text, $matches, PREG_SET_ORDER)) {
            $base = $matches[0][2];
        }

        $links = get_link($text);
        foreach ($links as $k => $v) {
            $new_link = get_absolute_url($base, (string) $k);
            $new_link = preg_replace("/#.*/", "", $new_link);
            # Filters are applied to the URL without its query string, but the
            # full URL (query included) is what gets queued.
            $new_link_stripped = preg_replace("/\?.*/", "", $new_link);
            if (check_url($new_link_stripped) && !array_key_exists($new_link, $visited)) {
                $to_visit[$new_link] = 1;
            }
        }

        index_file($text, $url);
    }
}

#=====================================================================
/**
 * Index one page: append its record to FINFO and register its words.
 *
 * Writes "url::size::title::description\n" to $fp_FINFO and appends the
 * packed 4-byte offset of that record to $words[<word>] for every distinct
 * word on the page that is at least $min_length bytes long and is not a key
 * of $stop_words_array.
 *
 * @param string $html_text raw page content
 * @param string $url       URL the content was fetched from
 */
function index_file($html_text, $url)
{
    global $cfn, $kbcount, $descr_size, $min_length, $stop_words_array, $use_esc;
    global $use_selective_indexing, $no_index_strings;
    global $use_META, $use_META_descr;
    global $fp_FINFO;
    global $words;
    # $numbers is interpolated into the word-character class below; it was
    # not declared global before and therefore always expanded to nothing.
    # Presumably it is set in config.php -- TODO confirm.
    global $numbers;

    $cfn++;
    $size = strlen($html_text);
    $kbcount += intval($size / 1024);
    print "$cfn -> $url; totalsize -> $kbcount kb\n\n";

    # Delete parts of document, which should not be indexed
    if ($use_selective_indexing == "YES") {
        foreach ($no_index_strings as $k => $v) {
            $html_text = preg_replace("/$k.*?$v/s", " ", $html_text);
        }
    }

    $title = "";
    if (preg_match("/<title>\s*(.*?)\s*<\/title>/i", $html_text, $matches)) {
        # The result of this replacement used to be thrown away.
        $title = preg_replace("/\s+/", " ", $matches[1]);
    }
    if ($title == "") {
        $title = "No title";
    }

    $keywords = "";
    $description = "";
    if ($use_META == "YES") {
        $res = get_META_info($html_text);
        $keywords = $res[0];
        $description = $res[1];
    }

    # Strip comments, scripts, styles and finally every remaining tag.
    $html_text = preg_replace("/<!--.*?-->/s", " ", $html_text);
    $html_text = preg_replace("/<[Ss][Cc][Rr][Ii][Pp][Tt].*?<\/[Ss][Cc][Rr][Ii][Pp][Tt]>/s", " ", $html_text);
    $html_text = preg_replace("/<[Ss][Tt][Yy][Ll][Ee].*?<\/[Ss][Tt][Yy][Ll][Ee]>/s", " ", $html_text);
    $html_text = preg_replace("/<[^>]*>/s", " ", $html_text);

    if ($use_esc == "YES") {
        # esc2char is defined outside this section.
        $html_text = preg_replace_callback("/&[a-zA-Z0-9#]*?;/", 'esc2char', $html_text);
    }

    # Logical AND; the original used the bitwise "&" operator here.
    if (($use_META_descr == "YES") && ($description != "")) {
        # Collapse whitespace so the description cannot break the
        # one-record-per-line layout of FINFO.
        $descript = substr(preg_replace("/\s+/s", " ", $description), 0, $descr_size);
    } else {
        $html_text = preg_replace("/\s+/s", " ", $html_text);
        $descript = substr($html_text, 0, $descr_size);
    }

    $html_text = $html_text . " " . $keywords . " " . $description;
    # NOTE(review): the accented ranges are byte ranges (no /u modifier); they
    # only mean what they appear to mean if this file and the pages share a
    # single-byte encoding -- confirm.
    $html_text = preg_replace("/[^a-zA-Zà-ÿÀ-ß$numbers ]/", " ", $html_text);
    $html_text = preg_replace("/\s+/s", " ", $html_text);
    $html_text = strtolower($html_text);

    # Distinct words of the page, in order of first appearance.
    $page_words = array_unique(explode(" ", $html_text));

    # Offset of this page's record in FINFO, as stored in the word index.
    $record_offset = pack("N", ftell($fp_FINFO));
    fwrite($fp_FINFO, "$url::$size::$title::$descript\n");

    foreach ($page_words as $word) {
        if (strlen($word) < $min_length) {
            continue;
        }
        if (array_key_exists($word, $stop_words_array)) {
            continue;
        }
        if (!isset($words[$word])) {
            $words[$word] = "";
        }
        $words[$word] .= $record_offset;
    }
}

#=====================================================================
/**
 * Build the HASH and HASHWORDS files from the global $words table.
 *
 * Expects every $words value to be the 8-byte pack("NN", ...) entry produced
 * by the SITEWORDS writing loop. Each word is hashed (whole word for
 * $INDEXING_SCHEME 1, its first 4 bytes for scheme 2, every 4-byte substring
 * for scheme 3) and its entry is appended to the matching bucket.
 *
 * HASH holds one 4-byte offset into HASHWORDS per bucket, 0 meaning "empty".
 * HASHWORDS starts with 4 zero bytes, so a real bucket never sits at
 * offset 0; each bucket is an entry count followed by the 8-byte entries.
 */
function build_hash()
{
    global $words;
    global $HASHSIZE, $INDEXING_SCHEME, $HASH, $HASHWORDS;

    $hash_array = array();
    for ($i = 0; $i < $HASHSIZE; $i++) {
        $hash_array[$i] = "";
    }

    foreach ($words as $word => $value) {
        # Number of substrings to hash. Under scheme 3 a word shorter than
        # 3 bytes yields a non-positive bound and is not hashed at all.
        if ($INDEXING_SCHEME == 3) {
            $subbound = strlen($word) - 3;
        } else {
            $subbound = 1;
        }
        if (strlen($word) == 3) {
            $subbound = 1;
        }
        $substring_length = 4;
        if ($INDEXING_SCHEME == 1) {
            $substring_length = strlen($word);
        }
        for ($i = 0; $i < $subbound; $i++) {
            $hash_value = abs(hash_word(substr($word, $i, $substring_length)) % $HASHSIZE);
            $hash_array[$hash_value] .= $value;
        }
    }

    $fp_HASH = fopen("$HASH", "wb");
    $fp_HASHWORDS = fopen("$HASHWORDS", "wb");
    $zzz = pack("N", 0);
    fwrite($fp_HASHWORDS, $zzz);
    $pos_hashwords = ftell($fp_HASHWORDS);
    $to_print_hash = "";
    $to_print_hashwords = "";
    for ($i = 0; $i < $HASHSIZE; $i++) {
        if ($hash_array[$i] == "") {
            $to_print_hash .= $zzz;
        } else {
            $to_print_hash .= pack("N", $pos_hashwords + strlen($to_print_hashwords));
            $to_print_hashwords .= pack("N", strlen($hash_array[$i]) / 8) . $hash_array[$i];
        }
        # Flush the buffers in chunks of roughly 64000 bytes.
        if (strlen($to_print_hashwords) > 64000) {
            fwrite($fp_HASH, $to_print_hash);
            fwrite($fp_HASHWORDS, $to_print_hashwords);
            $to_print_hash = "";
            $to_print_hashwords = "";
            $pos_hashwords = ftell($fp_HASHWORDS);
        }
    } # for $i
    fwrite($fp_HASH, $to_print_hash);
    fwrite($fp_HASHWORDS, $to_print_hashwords);
    fclose($fp_HASH);
    fclose($fp_HASHWORDS);
}

#=====================================================================
/**
 * String hash used to pick a bucket in build_hash().
 *
 * Formerly named hash(); that name collides with PHP's built-in hash() and
 * makes the script die with "Cannot redeclare". The algorithm is unchanged:
 * for each byte, shift the accumulator left by 4 and add the byte, then fold
 * bits 28-31 back in (shifted right by 24) and clear them.
 * NOTE(review): the search script must compute the same values -- confirm
 * it uses the same algorithm.
 *
 * @param string $key bytes to hash
 * @return int hash value; 0 for an empty key
 */
function hash_word($key)
{
    $key = (string) $key;
    $h = 0;
    $f = 0xF0000000;
    $length = strlen($key);
    for ($i = 0; $i < $length; $i++) {
        $h = ($h << 4) + ord($key[$i]);
        $g = $h & $f;
        if ($g) {
            $h ^= $g >> 24;
        }
        $h &= ~$g;
    }
    return $h;
}

#===================================================================
/**
 * Current Unix time in seconds, including the microsecond fraction.
 *
 * @return float
 */
function getmicrotime()
{
    return microtime(true);
}

#=====================================================================
/**
 * Collect link targets from <a href>, <frame src> and <area href> tags.
 *
 * @param string $text page content
 * @return array link target => 1, duplicates merged
 */
function get_link($text)
{
    $patterns = array(
        "/<a[^>]+href=([\"']?)([^\\s\"'>]+)\\1/is",
        "/<frame[^>]+src=([\"']?)([^\\s\"'>]+)\\1/is",
        "/<area[^>]+href=([\"']?)([^\\s\"'>]+)\\1/is",
    );

    $links = array();
    foreach ($patterns as $pattern) {
        if (preg_match_all($pattern, $text, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                # Group 2 is the attribute value without its quotes.
                $links[$match[2]] = 1;
            }
        }
    }
    return $links;
}

#=====================================================================
/**
 * Resolve a link found on a page against that page's base URL.
 *
 * URLs that already carry a scheme are returned untouched. Otherwise the
 * scheme, credentials, host and port are taken from $base, and the path is
 * combined with the directory part of $base's path; a leading "./" and any
 * number of leading "../" segments are resolved.
 *
 * @param string $base absolute URL of the page (or its <base href>)
 * @param string $url  link target as written in the page
 * @return string
 */
function get_absolute_url($base, $url)
{
    $url_arr = parse_url($url);
    if (!is_array($url_arr)) {
        # parse_url() returns false on seriously malformed URLs.
        $url_arr = array();
    }
    if (isset($url_arr["scheme"])) {
        return ($url);
    }

    $base_arr = parse_url($base);
    if (!is_array($base_arr)) {
        $base_arr = array();
    }
    $base_scheme = isset($base_arr["scheme"]) ? strtolower($base_arr["scheme"]) : "";

    # Protocol-relative link ("//host/path"): only the scheme is inherited.
    if (isset($url_arr["host"])) {
        return $base_scheme . ":" . $url;
    }

    $base_base = $base_scheme . "://";
    if (isset($base_arr["user"])) {
        $base_pass = isset($base_arr["pass"]) ? $base_arr["pass"] : "";
        $base_base .= $base_arr["user"] . ":" . $base_pass . "@";
    }
    $base_base .= isset($base_arr["host"]) ? strtolower($base_arr["host"]) : "";
    if (isset($base_arr["port"])) {
        $base_base .= ":" . $base_arr["port"];
    }

    $base_path = isset($base_arr["path"]) ? $base_arr["path"] : "";
    if ($base_path == "") {
        $base_path = "/";
    }
    # Keep only the directory part (everything up to the last slash).
    $base_path = preg_replace("/(.*\/).*/", "\\1", $base_path);

    # Host-relative link.
    if (isset($url_arr["path"]) && $url_arr["path"] !== "" && $url_arr["path"][0] == "/") {
        return $base_base . $url;
    }

    if (preg_match("'^\./'", $url)) {
        $url = preg_replace("'^\./'", "", $url);
        return $base_base . $base_path . $url;
    }

    # Each leading "../" removes one trailing directory from the base path.
    while (preg_match("'^\.\./'", $url)) {
        $url = preg_replace("'^\.\./'", "", $url);
        $base_path = preg_replace("/(.*\/).*\//", "\\1", $base_path);
    }

    return $base_base . $base_path . $url;
}

#=====================================================================
/**
 * Decide whether a URL may be crawled.
 *
 * The URL must start with "http://", match $file_ext, match neither
 * $no_index_files nor $no_index_dir, and match at least one pattern in
 * $allow_url. All patterns are applied case-insensitively.
 *
 * @param string $url absolute URL (the caller passes it without query string)
 * @return bool
 */
function check_url($url)
{
    global $file_ext, $no_index_files, $no_index_dir, $allow_url;

    if (!preg_match("'^http://'", $url)) {
        return FALSE;
    }
    if (!preg_match("'$file_ext'i", $url)) {
        return FALSE;
    }
    if (preg_match("'$no_index_files'i", $url)) {
        return FALSE;
    }
    if (preg_match("'$no_index_dir'i", $url)) {
        return FALSE;
    }

    foreach ($allow_url as $v) {
        if (preg_match("'$v'i", $url)) {
            return TRUE;
        }
    }
    return FALSE;
}

#=====================================================================
/**
 * Extract the META keywords and description of a page.
 *
 * Only tags written as <meta name=... content=...>, in that attribute
 * order, are recognised.
 *
 * @param string $html page content
 * @return array [0] => keywords, [1] => description; "" when the tag is absent
 */
function get_META_info($html)
{
    $res = array("", "");

    if (preg_match("/<[Mm][Ee][Tt][Aa]\s*[Nn][Aa][Mm][Ee]=\"?[Kk][Ee][Yy][Ww][Oo][Rr][Dd][Ss]\"?\s*[Cc][Oo][Nn][Tt][Ee][Nn][Tt]=\"?([^\"]*)\"?>/", $html, $matches)) {
        $res[0] = $matches[1];
    }

    if (preg_match("/<[Mm][Ee][Tt][Aa]\s*[Nn][Aa][Mm][Ee]=\"?[Dd][Ee][Ss][Cc][Rr][Ii][Pp][Tt][Ii][Oo][Nn]\"?\s*[Cc][Oo][Nn][Tt][Ee][Nn][Tt]=\"?([^\"]*)\"?>/", $html, $matches)) {
        $res[1] = $matches[1];
    }

    return $res;
}

#=====================================================================
?>