\n"; print "\$base_dir: $base_dir
\n"; print "\$base_url: $base_url
\n";

# NOTE(review): the statements down to the closing brace before scan_files()
# belong to a definition whose opening lies above this chunk. Their tokens are
# unchanged; they are only laid out one per line so that the '#' comments no
# longer swallow the statements that follow them.

#DEFINE CONSTANTS
$cfn = 0;
$cwn = 0;
$kbcount = 0;
$fp_FINFO = fopen ("$FINFO", "w");
fwrite($fp_FINFO, "\n");
$fp_SITEWORDS = fopen ("$SITEWORDS", "wb");
$fp_WORD_IND = fopen ("$WORD_IND", "wb");
$time1 = getmicrotime();
scan_files($base_dir);
$time2 = getmicrotime();
$time = $time2-$time1;
print "
Scan took $time sec.
";
print "Writing SITEWORDS
";
$pos_sitewords = ftell($fp_SITEWORDS);
$pos_word_ind = ftell($fp_WORD_IND);
$to_print_sitewords = "";
$to_print_word_ind = "";
foreach($words as $word=>$value) {
    $cwn++;
    $words_word_dum = pack("NN",$pos_sitewords+strlen($to_print_sitewords), $pos_word_ind+strlen($to_print_word_ind));
    $to_print_sitewords .= "$word\x0A";
    $to_print_word_ind .= pack("N",strlen($value)/4).$value;
    $words[$word] = $words_word_dum;
    if (strlen($to_print_word_ind) > 32000) {
        fwrite($fp_SITEWORDS, $to_print_sitewords);
        fwrite($fp_WORD_IND, $to_print_word_ind);
        $to_print_sitewords = "";
        $to_print_word_ind = "";
        $pos_sitewords = ftell($fp_SITEWORDS);
        $pos_word_ind = ftell($fp_WORD_IND);
    }
}
fwrite($fp_SITEWORDS, $to_print_sitewords);
fwrite($fp_WORD_IND, $to_print_word_ind);
fclose($fp_SITEWORDS);
fclose($fp_WORD_IND);
print "Build hash\n";
}

#=====================================================================
# Recursively walks $dir and indexes every file whose path matches the
# $file_ext pattern.
#
# Directories whose path matches $no_index_dir are not descended into and
# files whose URL matches $no_index_files are skipped. The URL is built by
# swapping the leading "$base_dir/" of the path for $base_url; it is then
# optionally stripped of default file names and lower-cased, depending on
# the "YES" switches. The global $cfn is incremented once per indexed file.
#
# $dir - directory to scan, without a trailing slash.
function scan_files ($dir) {
    global $base_dir, $base_url, $cfn;
    global $no_index_dir, $file_ext, $cut_default_filenames, $default_filenames, $url_to_lower_case, $no_index_files;

    $dir_h = opendir($dir);
    if ($dir_h === false) {
        # One unreadable directory must not abort the whole scan.
        print "Cannot open directory $dir\n";
        return;
    }

    $prefix = $base_dir."/";
    $prefix_len = strlen($prefix);

    while (false !== ($file = readdir($dir_h))) {
        if ($file === "." || $file === "..") {
            continue;
        }
        $new_dir = $dir."/".$file;

        if (is_dir($new_dir)) {
            if (!preg_match("'$no_index_dir'i", $new_dir)) {
                scan_files($new_dir);
            }
            continue;
        }

        if (!preg_match("'$file_ext'i", $new_dir)) {
            continue;
        }

        # Plain prefix swap instead of a regex: $base_dir is a literal path,
        # so dots or quotes in it must not be read as pattern syntax, and
        # $base_url must not be read as a replacement template.
        if (strncmp($new_dir, $prefix, $prefix_len) == 0) {
            $url = $base_url.substr($new_dir, $prefix_len);
        } else {
            $url = $new_dir;
        }

        if (preg_match("'$no_index_files'i", $url)) {
            continue;
        }
        if ($cut_default_filenames == "YES") {
            $url = preg_replace("'$default_filenames'i", "/", $url);
        }
        if ($url_to_lower_case == "YES") {
            $url = strtolower($url);
        }

        index_file($new_dir, $url);
        $cfn++;
    }
    closedir($dir_h);
}

#=====================================================================
# Indexes one file: appends its record to the FINFO file and adds the
# record's offset to the posting string of every word found in the file.
#
# The record written to $fp_FINFO is "url::size::mtime::title::description".
# Its byte offset, packed as a 32-bit big-endian integer, is appended to
# $words[<word>] for each distinct word that is at least $min_length bytes
# long and is not a key of $stop_words_array.
#
# $new_dir - filesystem path of the file to index.
# $url     - URL under which the file is recorded.
function index_file($new_dir,$url) {
    global $cfn, $kbcount, $descr_size, $min_length, $stop_words_array, $use_esc;
    global $use_selective_indexing, $no_index_strings;
    global $use_META, $use_META_descr;
    global $fp_FINFO;
    global $words;
    # $numbers is spliced into the word character class below but was never
    # imported, so it was always empty inside this function.
    # NOTE(review): presumably a config value such as "0-9" - confirm that the
    # search side expects digits in indexed words.
    global $numbers;

    $size = filesize($new_dir);
    $modify = date("F d Y H:i:s.", filemtime($new_dir));
    $kbcount += intval($size/1024);
    print "$cfn -> $new_dir; totalsize -> $kbcount Kb\n\n";

    # file_get_contents() copes with empty files, where fread() with a zero
    # length fails.
    $html_text = file_get_contents($new_dir);
    if ($html_text === false) {
        print "Cannot read $new_dir\n";
        return;
    }

    # Delete parts of document, which should not be indexed
    if ($use_selective_indexing == "YES") {
        foreach ($no_index_strings as $k => $v) {
            $html_text = preg_replace("/$k.*?$v/s"," ",$html_text);
        }
    }

    # Collapse every whitespace run (line breaks included) to a single space.
    $html_text = ltrim(preg_replace("/\s+/s", " ", $html_text));

    # The title pattern had lost its delimiting tags, so its lazy group always
    # captured the empty string and every page was recorded as "No title".
    $title = "";
    if (preg_match("#<title[^>]*>\s*(.*?)\s*</title>#is", $html_text, $matches)) {
        $title = preg_replace("/\s+/", " ", $matches[1]);
    }
    if ($title == "") {
        $title = "No title";
    }

    $keywords = "";
    $description = "";
    if ($use_META == "YES") {
        $res = get_META_info($html_text);
        $keywords = $res[0];
        $description = $res[1];
    }

    # Strip HTML comments, scripts, style sheets and finally all remaining
    # tags. The comment pattern was empty, which matches between every two
    # characters and so split each word into single letters.
    # NOTE(review): "<!-- ... -->" is inferred from the neighbouring
    # tag-stripping steps - confirm.
    $html_text = preg_replace("/<!--.*?-->/s"," ",$html_text);
    $html_text = preg_replace("/<script.*?<\/script>/si"," ",$html_text);
    $html_text = preg_replace("/<style.*?<\/style>/si"," ",$html_text);
    $html_text = preg_replace("/<[^>]*>/s"," ",$html_text);

    if ($use_esc == "YES") {
        $html_text = preg_replace_callback("/&[a-zA-Z0-9#]*?;/", 'esc2char', $html_text);
    }

    if ($use_META_descr == "YES" && $description != "") {
        $descript = substr($description,0,$descr_size);
    } else {
        $html_text = preg_replace("/\s+/s"," ",$html_text);
        # The description is the page text with the title removed. The title
        # is literal text, so it is quoted before being used as a pattern.
        $descript = preg_replace("#".preg_quote($title, "#")."#si", "", $html_text);
        $descript = wordwrap($descript, 130, ":`:");
    }

    $html_text = $html_text." ".$keywords." ".$description;

    # Keep only word characters.
    # NOTE(review): the accented ranges look like they assume a single-byte
    # encoding for both this file and the indexed documents - confirm.
    $extra_chars = isset($numbers) ? $numbers : "";
    $html_text = preg_replace("/[^a-zA-Zà-ÿÀ-ß$extra_chars ]/"," ",$html_text);
    $html_text = preg_replace("/\s+/s"," ",$html_text);
    $html_text = strtolower($html_text);

    $punct = array(",", ".", ";", "!", ":", "?", "\"", "(", ")");
    $html_text = str_replace($punct, " ", $html_text);

    # Distinct words of this document, in order of first appearance.
    $words_temp = array();
    foreach (explode(" ", $html_text) as $word) {
        $words_temp[$word] = 1;
    }

    # The offset must be taken before the record is written.
    $pos = pack("N", ftell($fp_FINFO));
    fwrite($fp_FINFO, "$url::$size::$modify::$title::$descript\n");

    foreach ($words_temp as $word => $val) {
        if (strlen($word) < $min_length) {
            continue;
        }
        if (array_key_exists($word,$stop_words_array)) {
            continue;
        }
        if (!isset($words[$word])) {
            $words[$word] = "";
        }
        $words[$word] .= $pos;
    }
}

#=====================================================================
# Builds the hash table over the global $words and writes it to the $HASH
# and $HASHWORDS files.
#
# Each word's value is appended to the bucket of every hashed substring of
# the word. With $INDEXING_SCHEME 3 every 4-byte window of the word is
# hashed, with scheme 1 the whole word, otherwise only its first 4 bytes.
# $HASH receives one 32-bit big-endian offset per bucket (0 for an empty
# bucket). $HASHWORDS receives, per non-empty bucket, an entry count followed
# by the bucket's 8-byte entries.
function build_hash() {
    global $words;
    global $HASHSIZE, $INDEXING_SCHEME, $HASH, $HASHWORDS;

    $hash_array = array();
    for ($i=0; $i<$HASHSIZE; $i++) {
        $hash_array[$i] = "";
    }

    foreach ($words as $word=>$value) {
        $word_len = strlen($word);

        # Number of substrings to hash; a 3-byte word still gets one.
        if ($INDEXING_SCHEME == 3 && $word_len != 3) {
            $subbound = $word_len-3;
        } else {
            $subbound = 1;
        }
        $substring_length = ($INDEXING_SCHEME == 1) ? $word_len : 4;

        for ($i=0; $i<$subbound; $i++) {
            $hash_value = abs(a_hash(substr($word,$i,$substring_length)) % $HASHSIZE);
            $hash_array[$hash_value] .= $value;
        }
    }

    $fp_HASH = fopen ("$HASH", "wb");
    $fp_HASHWORDS = fopen ("$HASHWORDS", "wb");
    if ($fp_HASH === false || $fp_HASHWORDS === false) {
        print "Cannot open hash files for writing\n";
        if ($fp_HASH !== false) { fclose($fp_HASH); }
        if ($fp_HASHWORDS !== false) { fclose($fp_HASHWORDS); }
        return;
    }

    # Offset 0 marks an empty bucket, so HASHWORDS starts with four filler
    # bytes and no real entry can ever sit at offset 0.
    $zzz = pack("N", 0);
    fwrite($fp_HASHWORDS, $zzz);
    $pos_hashwords = ftell($fp_HASHWORDS);

    $to_print_hash = "";
    $to_print_hashwords = "";
    for ($i=0; $i<$HASHSIZE; $i++) {
        if ($hash_array[$i] === "") {
            $to_print_hash .= $zzz;
        } else {
            $to_print_hash .= pack("N",$pos_hashwords + strlen($to_print_hashwords));
            $to_print_hashwords .= pack("N", strlen($hash_array[$i])/8).$hash_array[$i];
        }

        # Flush both buffers periodically to bound memory use.
        if (strlen($to_print_hashwords) > 64000) {
            fwrite($fp_HASH,$to_print_hash);
            fwrite($fp_HASHWORDS,$to_print_hashwords);
            $to_print_hash = "";
            $to_print_hashwords = "";
            $pos_hashwords = ftell($fp_HASHWORDS);
        }
    }
    fwrite($fp_HASH,$to_print_hash);
    fwrite($fp_HASHWORDS,$to_print_hashwords);
    fclose($fp_HASH);
    fclose($fp_HASHWORDS);
}

#=====================================================================
# Returns an integer hash of the bytes of $key.
#
# NOTE(review): the middle of this function was lost from the file (only the
# preg_split() call, the start of a for loop and the tail
# "... >> 24; }; $h &= ~$g; } return $h;" survived). The loop below is a
# reconstruction as the PJW/ELF string hash, which those fragments match.
# Confirm it against the hash used by the search script before rebuilding an
# index, since both sides must agree.
#
# $key - string to hash.
function a_hash($key) {
    # Splitting on the empty pattern yields an empty first and last element,
    # hence the 1 .. count-2 loop bounds.
    $chars = preg_split("//",$key);
    $last = count($chars)-1;

    $h = 0;
    for ($i=1; $i<$last; $i++) {
        $h = ($h << 4) + ord($chars[$i]);
        $g = $h & 0xF0000000;
        if ($g != 0) {
            $h ^= $g >> 24;
        }
        $h &= ~$g;
    }
    return $h;
}

#===================================================================
# Returns the current Unix time in seconds as a float with microseconds.
function getmicrotime(){
    return microtime(true);
}

#=====================================================================
# Extracts the META keywords and description from an HTML string.
#
# $html - HTML text to search.
#
# Returns a two-element array: [0] the content of the keywords META tag,
# [1] the content of the description META tag. An element is "" when its
# tag is not found.
function get_META_info($html) {
    $res = array("", "");

    if (preg_match("/<meta\s*name=\"?keywords\"?\s*content=\"?([^\"]*)\"?>/i",$html,$matches)) {
        $res[0] = $matches[1];
    }
    if (preg_match("/<meta\s*name=\"?description\"?\s*content=\"?([^\"]*)\"?>/i",$html,$matches)) {
        $res[1] = $matches[1];
    }

    return $res;
}

#=====================================================================
?>