/**
* RiSearch PHP
*
* web search engine, version 0.1b
* (c) Sergej Tarasov, 2000-2002
*
* Homepage: http://risearch.org/
* email: risearch@risearch.org
* Last modified: 11.11.2002
*/
include "config.php";
# Build the index once per indexing scheme (1, 2 and 3). Change_DB() points
# the index file name globals at the "<scheme>db/" directory for that scheme,
# Beginning() scans the files and build_hash() finishes the pass.
foreach (array(1, 2, 3) as $INDEXING_SCHEME) {
    Change_DB($INDEXING_SCHEME);
    Beginning();
    build_hash();
    print "$cfn files are indexed\n";
}
/**
 * Point the index file name globals at the database of one indexing scheme.
 *
 * Every name is built as "<mode>db/0_<name>", e.g. "1db/0_hash" for mode 1.
 * Sets the globals $HASH, $HASHWORDS, $FINFO, $SITEWORDS and $WORD_IND.
 *
 * @param mixed $mode scheme identifier; converted to a string and used as
 *                    the prefix of the "db" directory name
 */
function Change_DB($mode) {
    global $HASH, $HASHWORDS, $FINFO, $SITEWORDS, $WORD_IND;

    // Common leading part shared by all five file names.
    $db_prefix = strval($mode) . "db/0_";

    $HASH      = $db_prefix . "hash";
    $HASHWORDS = $db_prefix . "hashwords";
    $FINFO     = $db_prefix . "finfo";
    $SITEWORDS = $db_prefix . "sitewords";
    $WORD_IND  = $db_prefix . "word_ind";
}
/**
 * Run the scanning phase of one indexing pass and write the word files.
 *
 * Steps, in order:
 *  1. Reset the global counters $cfn and $kbcount to 0.
 *  2. Open $FINFO (handle kept in the global $fp_FINFO and seeded with a
 *     single "\n"), $SITEWORDS and $WORD_IND for writing.
 *  3. Walk $base_dir with scan_files() and print the elapsed time.
 *  4. For every entry of the global $words array, append "<word>\n" to
 *     $SITEWORDS and pack("N", strlen($value)/4) followed by $value to
 *     $WORD_IND, then overwrite $words[$word] with pack("NN", ...) holding
 *     the byte offsets of those two records.
 *
 * The script is aborted with a message when one of the three files cannot be
 * opened; without that check every later write would operate on `false`.
 *
 * $fp_FINFO is deliberately left open when this function returns.
 * NOTE(review): presumably index_file() writes to it and a later stage
 * closes it -- neither is visible from here, confirm.
 */
function Beginning() {
    global $FINFO, $SITEWORDS, $WORD_IND;
    global $base_dir, $base_url, $cfn, $kbcount;
    global $fp_FINFO;
    global $words;

    print "Start indexing
\n";
    print "\$base_dir: $base_dir
\n";
    print "\$base_url: $base_url
\n";

    # Reset the per-pass counters
    $cfn = 0;
    $kbcount = 0;

    $fp_FINFO = fopen($FINFO, "w");
    if ($fp_FINFO === false) {
        die("Cannot open $FINFO for writing\n");
    }
    fwrite($fp_FINFO, "\n");

    $fp_SITEWORDS = fopen($SITEWORDS, "wb");
    if ($fp_SITEWORDS === false) {
        die("Cannot open $SITEWORDS for writing\n");
    }

    $fp_WORD_IND = fopen($WORD_IND, "wb");
    if ($fp_WORD_IND === false) {
        die("Cannot open $WORD_IND for writing\n");
    }

    $time1 = getmicrotime();
    scan_files($base_dir);
    $time2 = getmicrotime();
    $time = $time2-$time1;
    print "
Scan took $time sec.
";
    print "Writing SITEWORDS
";

    # File positions as of the last flush; the pending buffers are added on
    # top of them to get the absolute offset of each record.
    $pos_sitewords = ftell($fp_SITEWORDS);
    $pos_word_ind = ftell($fp_WORD_IND);
    $to_print_sitewords = "";
    $to_print_word_ind = "";

    # $words is filled outside this function (its initialisation is not
    # visible here); skip the loop instead of raising a foreach warning
    # when nothing was collected.
    if (is_array($words)) {
        foreach ($words as $word => $value) {
            # Offsets of this word's records, computed before they are buffered.
            $words_word_dum = pack("NN", $pos_sitewords+strlen($to_print_sitewords),
                                         $pos_word_ind+strlen($to_print_word_ind));
            $to_print_sitewords .= "$word\x0A";
            # Value is prefixed with its length in 4-byte units.
            $to_print_word_ind .= pack("N", strlen($value)/4).$value;
            $words[$word] = $words_word_dum;

            # Flush both buffers once the WORD_IND buffer passes 32000 bytes.
            if (strlen($to_print_word_ind) > 32000) {
                fwrite($fp_SITEWORDS, $to_print_sitewords);
                fwrite($fp_WORD_IND, $to_print_word_ind);
                $to_print_sitewords = "";
                $to_print_word_ind = "";
                $pos_sitewords = ftell($fp_SITEWORDS);
                $pos_word_ind = ftell($fp_WORD_IND);
            }
        }
    }

    # Write whatever is still buffered.
    fwrite($fp_SITEWORDS, $to_print_sitewords);
    fwrite($fp_WORD_IND, $to_print_word_ind);
    fclose($fp_SITEWORDS);
    fclose($fp_WORD_IND);
    print "Build hash\n";
}
#=====================================================================
/**
 * Recursively walk a directory and pass every indexable file to index_file().
 *
 * For each entry below $dir:
 *  - sub-directories are descended into unless their path matches the
 *    $no_index_dir pattern;
 *  - files are considered only when their path matches $file_ext;
 *  - the leading "$base_dir/" of the path is replaced by $base_url to form
 *    the URL (compared literally, so regex metacharacters in $base_dir are
 *    harmless);
 *  - URLs matching $no_index_files are skipped;
 *  - with $cut_default_filenames == "YES" the $default_filenames pattern is
 *    replaced by "/", with $url_to_lower_case == "YES" the URL is lowercased;
 *  - index_file($path, $url) is called and the global $cfn is incremented.
 *
 * The pattern globals are inserted between ' delimiters and matched
 * case-insensitively. A directory that cannot be opened is reported and
 * skipped instead of being read through an invalid handle.
 *
 * @param string $dir directory to scan, without trailing slash
 */
function scan_files ($dir) {
    global $base_dir, $base_url, $cfn;
    global $no_index_dir, $file_ext, $cut_default_filenames, $default_filenames, $url_to_lower_case, $no_index_files;

    $dir_h = opendir($dir);
    if ($dir_h === false) {
        # readdir() on a failed handle never yields a usable entry.
        print "Cannot open directory $dir\n";
        return;
    }

    $prefix = $base_dir . "/";
    $prefix_len = strlen($prefix);

    while (false !== ($file = readdir($dir_h))) {
        if ($file === "." || $file === "..") {
            continue;
        }
        $new_dir = $dir."/".$file;

        if (is_dir($new_dir)) {
            if (!preg_match("'$no_index_dir'i", $new_dir)) {
                scan_files($new_dir);
            }
            continue;
        }

        if (!preg_match("'$file_ext'i", $new_dir)) {
            continue;
        }

        # Map the filesystem path to its URL by swapping the base directory
        # prefix for the base URL; paths outside $base_dir stay unchanged.
        if (strncmp($new_dir, $prefix, $prefix_len) === 0) {
            $url = $base_url . substr($new_dir, $prefix_len);
        } else {
            $url = $new_dir;
        }

        if (preg_match("'$no_index_files'i", $url)) {
            continue;
        }
        if ($cut_default_filenames == "YES") {
            $url = preg_replace("'$default_filenames'i", "/", $url);
        }
        if ($url_to_lower_case == "YES") {
            $url = strtolower($url);
        }

        index_file($new_dir, $url);
        $cfn++;
    }
    closedir($dir_h);
}
#=====================================================================
function index_file($new_dir,$url) {
global $cfn, $kbcount, $descr_size, $min_length, $stop_words_array, $use_esc;
global $use_selective_indexing, $no_index_strings;
global $use_META, $use_META_descr;
global $fp_FINFO;
global $words;
$size = filesize($new_dir);
$modify = date("F d Y H:i:s.", filemtime($new_dir));
$kbcount += intval($size/1024);
print "$cfn -> $new_dir; totalsize -> $kbcount Kb
\n";
$fd = fopen ($new_dir, "rb");
$html_text = fread ($fd, $size);
fclose ($fd);
# Delete parts of document, which should not be indexed
if ($use_selective_indexing == "YES") {
foreach ($no_index_strings as $k => $v) {
$html_text = preg_replace("/$k.*?$v/s"," ",$html_text);
}
}
//$ns = array("/\r\n/s","/\n/s","/\r/s");
$html_text = preg_replace("/\s+/s", " ", $html_text);
$html_text = preg_replace("/\s*\r\n\s*/s", "\r\n", $html_text);
$html_text = preg_replace("#\r\n+#s", "\r\n", $html_text);
preg_match("#