/**
* RiSearch PHP
*
* web search engine, version 0.1b
* (c) Sergej Tarasov, 2000-2002
*
* Homepage: http://risearch.org/
* email: risearch@risearch.org
* Last modified: 11.11.2002
*/
# Main indexing script: crawl the site, then write the word list
# (SITEWORDS), the per-word document lists (WORD_IND) and the hash tables.
print "Start indexing
\n";
include "config.php";

# Running totals; index_file() updates $cfn and $kbcount through `global`.
$cfn = 0;      # number of documents indexed
$cwn = 0;      # number of distinct words written
$kbcount = 0;  # accumulated document size in kb

# index_file() appends to $words and the loops below iterate over it, so it
# must be an array even when no document gets indexed.
if (!isset($words)) {
    $words = array();
}

$fp_FINFO = fopen($FINFO, "w");
$fp_SITEWORDS = fopen($SITEWORDS, "wb");
$fp_WORD_IND = fopen($WORD_IND, "wb");
if (!$fp_FINFO || !$fp_SITEWORDS || !$fp_WORD_IND) {
    # Without all three output files nothing useful can be produced.
    print "Error: cannot open index files for writing\n";
    exit(1);
}
# The file-info database starts with a single newline
# (presumably so that no record sits at offset 0 -- TODO confirm).
fwrite($fp_FINFO, "\n");

$time1 = getmicrotime();
start_spidering();
$time2 = getmicrotime();
$time = $time2-$time1;
print "
Scan took $time sec.
";
# All document records are written once the crawl is finished.
fclose($fp_FINFO);

print "Writing SITEWORDS\n";
$pos_sitewords = ftell($fp_SITEWORDS);
$pos_word_ind = ftell($fp_WORD_IND);
$to_print_sitewords = "";
$to_print_word_ind = "";
foreach ($words as $word => $value) {
    $cwn++;
    # Offsets at which this word's two records will land: file position of
    # the last flush plus what is still waiting in the buffers.
    $word_offsets = pack(
        "NN",
        $pos_sitewords + strlen($to_print_sitewords),
        $pos_word_ind + strlen($to_print_word_ind)
    );
    $to_print_sitewords .= "$word\x0A";
    # $value is a string of 4-byte packed FINFO offsets; store their count first.
    $to_print_word_ind .= pack("N", (int)(strlen($value) / 4)) . $value;
    # From here on $words maps word => packed offsets (input for build_hash()).
    $words[$word] = $word_offsets;
    if (strlen($to_print_word_ind) > 32000) {
        # Flush both buffers together so the recorded offsets stay in step.
        fwrite($fp_SITEWORDS, $to_print_sitewords);
        fwrite($fp_WORD_IND, $to_print_word_ind);
        $to_print_sitewords = "";
        $to_print_word_ind = "";
        $pos_sitewords = ftell($fp_SITEWORDS);
        $pos_word_ind = ftell($fp_WORD_IND);
    }
}
fwrite($fp_SITEWORDS, $to_print_sitewords);
fwrite($fp_WORD_IND, $to_print_word_ind);
fclose($fp_SITEWORDS);
fclose($fp_WORD_IND);

print "Build hash\n";
build_hash();
print "$cfn files are indexed\n";
#=====================================================================
# Crawl the site starting from the URLs in $start_url.
#
# URLs are fetched in the order they were discovered. Every page that can be
# opened is scanned for links and then handed to index_file(). Links are
# queued only when they pass check_url() (tested without the query string)
# and have not been visited yet.
function start_spidering() {
    global $start_url;

    # Queue of URLs still to fetch (keys) and set of URLs already handled.
    $to_visit = array();
    foreach ($start_url as $v) {
        $to_visit[$v] = 1;
    }
    $visited = array();

    while (count($to_visit) > 0) {
        # Take the oldest queued URL (each() no longer exists in PHP 8).
        reset($to_visit);
        $url = (string)key($to_visit);
        unset($to_visit[$url]);
        $visited[$url] = 1;

        print "Url: $url\n";
        $fp = fopen($url,"r");
        if ( $fp == FALSE ) {
            # Skip the page entirely; otherwise the text of the previously
            # fetched page would be indexed under this URL.
            print "Error in opening file: $url\n";
            continue;
        }
        $text = "";
        while (!feof ($fp)) {
            $chunk = fgets($fp, 4096);
            if ($chunk === FALSE) {
                # Read error or end of stream: stop instead of spinning.
                break;
            }
            $text .= $chunk;
        }
        fclose($fp);
        print "$url - ".strlen($text)." bytes\n";

        # Relative links are resolved against <base href="..."> when the
        # page declares one, otherwise against the page URL itself.
        $base = $url;
        if (preg_match("/<base[^>]+href=([\"']?)([^\\s\"'>]+)\\1/is", $text, $base_match)) {
            $base = $base_match[2];
        }

        $links = get_link($text);
        foreach ($links as $k => $v) {
            $new_link = get_absolute_url($base,(string)$k);
            $new_link = preg_replace("/#.*/","",$new_link);
            # Filters are applied to the URL without its query string, but
            # the full URL (query included) is what gets queued.
            $new_link_stripped = preg_replace("/\?.*/","",$new_link);
            if ( check_url($new_link_stripped)) {
                if ( ! array_key_exists($new_link,$visited)) {
                    $to_visit[$new_link] = 1;
                }
            }
        }
        index_file($text,$url);
    }
}
#=====================================================================
# Index one fetched document.
#
# $html_text - raw HTML of the document
# $url       - URL it was fetched from
#
# Appends one record "url::size::title::description" to the file-info
# database ($fp_FINFO) and appends the packed offset of that record to the
# entry of every indexable word in the global $words array.
# Updates the global counters $cfn (documents) and $kbcount (kb processed).
function index_file($html_text,$url) {
    global $cfn, $kbcount, $descr_size, $min_length, $stop_words_array, $use_esc;
    global $use_selective_indexing, $no_index_strings;
    global $use_META, $use_META_descr;
    global $fp_FINFO;
    global $words;
    # $numbers is interpolated into the word-character class below. It used
    # to be read without a `global` declaration, so it was always empty.
    # NOTE(review): presumably set in config.php (e.g. "0-9") -- confirm.
    global $numbers;

    if (!is_array($words)) {
        $words = array();
    }

    $cfn++;
    $size = strlen($html_text);
    $kbcount += intval($size/1024);
    print "$cfn -> $url; totalsize -> $kbcount kb
\n";

    # Delete parts of document, which should not be indexed
    if ($use_selective_indexing == "YES") {
        foreach ($no_index_strings as $k => $v) {
            $html_text = preg_replace("/$k.*?$v/s"," ",$html_text);
        }
    }

    # Title: tolerate attributes and line breaks inside <title>, then
    # collapse whitespace so the record stays on a single line.
    $title = "";
    if (preg_match("/<title[^>]*>\s*(.*?)\s*<\/title>/is",$html_text,$matches)) {
        $title = preg_replace("/\s+/"," ",$matches[1]);
    }
    if ($title == "") { $title = "No title"; }

    $keywords = "";
    $description = "";
    if ($use_META == "YES") {
        $res = get_META_info($html_text);
        $keywords = $res[0];
        $description = $res[1];
    }

    # Strip HTML comments, scripts, style sheets and finally all tags.
    $html_text = preg_replace("/<!--.*?-->/s"," ",$html_text);
    $html_text = preg_replace("/<[Ss][Cc][Rr][Ii][Pp][Tt].*?<\/[Ss][Cc][Rr][Ii][Pp][Tt]>/s"," ",$html_text);
    $html_text = preg_replace("/<[Ss][Tt][Yy][Ll][Ee].*?<\/[Ss][Tt][Yy][Ll][Ee]>/s"," ",$html_text);
    $html_text = preg_replace("/<[^>]*>/s"," ",$html_text);
    if ($use_esc == "YES") { $html_text = preg_replace_callback("/&[a-zA-Z0-9#]*?;/", 'esc2char', $html_text); }

    # Description shown in results: META description if enabled and
    # present, otherwise the beginning of the visible text.
    if (($use_META_descr == "YES") && ($description != "")) {
        # Collapse whitespace: a line break here would split the record.
        $descript = substr(preg_replace("/\s+/s"," ",$description),0,$descr_size);
    } else {
        $html_text = preg_replace("/\s+/s"," ",$html_text);
        $descript = substr($html_text,0,$descr_size);
    }

    # Words come from the visible text plus META keywords and description.
    $html_text = $html_text." ".$keywords." ".$description;
    $number_chars = isset($numbers) ? $numbers : "";
    # NOTE(review): the non-ASCII ranges below depend on the encoding this
    # file is saved in -- confirm it matches the indexed pages.
    $html_text = preg_replace("/[^a-zA-Zà-ÿÀ-ß$number_chars ]/"," ",$html_text);
    $html_text = preg_replace("/\s+/s"," ",$html_text);
    $html_text = strtolower($html_text);

    # Offset of this document's record, taken before the record is written.
    $pos = pack("N",ftell($fp_FINFO));
    fwrite($fp_FINFO, "$url::$size::$title::$descript\n");

    # Each distinct word gets the document offset appended exactly once.
    $seen = array();
    foreach (explode(" ",$html_text) as $word) {
        if (isset($seen[$word])) { continue; }
        $seen[$word] = 1;
        if (strlen($word) < $min_length) { continue; }
        if (is_array($stop_words_array) && array_key_exists($word,$stop_words_array)) { continue; }
        if (isset($words[$word])) {
            $words[$word] .= $pos;
        } else {
            $words[$word] = $pos;
        }
    }
}
#=====================================================================
# Build the on-disk hash table from the global $words array.
#
# At this point $words maps each word to 8 bytes (two packed 32-bit
# offsets). Every word is hashed -- whole (scheme 1), by its first 4
# characters (scheme 2 or any other value) or by every 4-character
# substring (scheme 3) -- and its 8 bytes are appended to the bucket(s) it
# falls into.
#
# Output files:
#   $HASH      - $HASHSIZE packed 32-bit offsets into $HASHWORDS, 0 = empty
#   $HASHWORDS - 4 zero bytes (so that no bucket sits at offset 0), then
#                for each non-empty bucket: entry count, then the entries
function build_hash() {
    global $words;
    global $HASHSIZE, $INDEXING_SCHEME, $HASH, $HASHWORDS;

    $hash_array = array();
    for ($i=0; $i<$HASHSIZE; $i++) { $hash_array[$i] = ""; }

    $all_words = is_array($words) ? $words : array();
    foreach ($all_words as $word => $value) {
        $word = (string)$word;   # numeric words come back as integer keys
        $word_length = strlen($word);

        # Number of substrings to hash.
        if ($INDEXING_SCHEME == 3) { $subbound = $word_length-3; }
        else { $subbound = 1; }
        if ($word_length == 3) { $subbound = 1; }

        # Length of each hashed substring.
        $substring_length = 4;
        if ($INDEXING_SCHEME == 1) { $substring_length = $word_length; }

        for ($i=0; $i<$subbound; $i++) {
            $hash_value = abs(risearch_hash(substr($word,$i,$substring_length)) % $HASHSIZE);
            $hash_array[$hash_value] .= $value;
        }
    }

    $fp_HASH = fopen($HASH, "wb");
    $fp_HASHWORDS = fopen($HASHWORDS, "wb");
    if (!$fp_HASH || !$fp_HASHWORDS) {
        print "Error: cannot open hash files for writing\n";
        if ($fp_HASH) { fclose($fp_HASH); }
        if ($fp_HASHWORDS) { fclose($fp_HASHWORDS); }
        return;
    }

    $zzz = pack("N", 0);
    fwrite($fp_HASHWORDS, $zzz);
    $pos_hashwords = ftell($fp_HASHWORDS);
    $to_print_hash = "";
    $to_print_hashwords = "";
    for ($i=0; $i<$HASHSIZE; $i++) {
        if ($hash_array[$i] == "") {
            $to_print_hash .= $zzz;
        } else {
            # Bucket offset = position of the last flush + pending bytes.
            $to_print_hash .= pack("N",$pos_hashwords + strlen($to_print_hashwords));
            # Each entry is 8 bytes, so length/8 is the entry count.
            $to_print_hashwords .= pack("N", (int)(strlen($hash_array[$i])/8)).$hash_array[$i];
        }
        if (strlen($to_print_hashwords) > 64000) {
            fwrite($fp_HASH,$to_print_hash);
            fwrite($fp_HASHWORDS,$to_print_hashwords);
            $to_print_hash = "";
            $to_print_hashwords = "";
            $pos_hashwords = ftell($fp_HASHWORDS);
        }
    }
    fwrite($fp_HASH,$to_print_hash);
    fwrite($fp_HASHWORDS,$to_print_hashwords);
    fclose($fp_HASH);
    fclose($fp_HASHWORDS);
}
#=====================================================================
# String hash used to pick a bucket for a word (or word fragment).
#
# For every byte: shift the hash left by 4 bits and add the byte, then fold
# the top nibble (bits 28-31) back in with XOR and clear it.
# Returns 0 for the empty string.
#
# NOTE(review): the loop was reconstructed from a damaged source line;
# confirm it produces the same values as the hash in the search script.
function risearch_hash($key) {
    $key = (string)$key;
    $h = 0;
    $length = strlen($key);
    for ($i=0; $i<$length; $i++) {
        $h = ($h << 4) + ord(substr($key,$i,1));
        $g = $h & 0xF0000000;
        if ($g) { $h ^= $g >> 24; }
        $h &= ~$g;
    }
    return $h;
}
# Backward-compatible alias under the old name. PHP has shipped a built-in
# hash() since 5.1.2, so the alias can only be declared where that built-in
# is missing; declaring it unconditionally is a fatal "cannot redeclare".
if (!function_exists('hash')) {
    function hash($key) {
        return risearch_hash($key);
    }
}
#===================================================================
# Current Unix time in seconds as a float, including the fractional part.
function getmicrotime() {
    # microtime() yields "fraction seconds"; add the two halves together.
    $parts = explode(" ", microtime());
    $fraction = (float)$parts[0];
    $seconds = (float)$parts[1];
    return ($fraction + $seconds);
}
#=====================================================================
# Extract link targets from an HTML page.
#
# Collects the href of <a> and <area> tags and the src of <frame>/<iframe>
# tags. The value may be unquoted or wrapped in matching single or double
# quotes. Returns an array whose KEYS are the raw (unresolved) targets;
# the values are always 1.
#
# NOTE(review): the tag prefixes of the three patterns were lost in the
# damaged source and have been reconstructed -- confirm the tag list.
function get_link($text) {
    $patterns = array(
        "/<a\\s[^>]*href=([\"']?)([^\\s\"'>]+)\\1/is",
        "/<i?frame[^>]+src=([\"']?)([^\\s\"'>]+)\\1/is",
        "/<area[^>]+href=([\"']?)([^\\s\"'>]+)\\1/is"
    );
    $links = array();
    foreach ($patterns as $pattern) {
        if (preg_match_all($pattern, $text, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                # Group 1 is the optional quote, group 2 the target itself.
                $links[$match[2]] = 1;
            }
        }
    }
    return $links;
}
#=====================================================================
# Resolve a link found in a page against that page's base URL.
#
# $base - absolute URL of the page (or of its <base href>)
# $url  - link target as written in the page
#
# Returns $url unchanged when it already carries a scheme; otherwise
# returns an absolute URL built from the scheme, credentials, host and port
# of $base (scheme and host lower-cased) and the resolved path.
# Handles protocol-relative ("//host/..."), root-relative ("/..."),
# "./" and leading "../" forms.
function get_absolute_url($base,$url) {
    $url_arr = parse_url($url);
    if (is_array($url_arr) && isset($url_arr["scheme"])) {
        return($url);
    }

    $base_arr = parse_url($base);
    if (!is_array($base_arr)) { $base_arr = array(); }
    $scheme = isset($base_arr["scheme"]) ? strtolower($base_arr["scheme"]) : "";

    # "//host/path" keeps the scheme of the base but names its own host.
    if (substr($url,0,2) == "//") {
        return $scheme.":".$url;
    }

    $base_base = $scheme."://";
    if (isset($base_arr["user"])) {
        $base_base .= $base_arr["user"];
        if (isset($base_arr["pass"])) {
            $base_base .= ":".$base_arr["pass"];
        }
        $base_base .= "@";
    }
    if (isset($base_arr["host"])) {
        $base_base .= strtolower($base_arr["host"]);
    }
    if (isset($base_arr["port"])) {
        $base_base .= ":".$base_arr["port"];
    }

    # Directory part of the base path, always ending in "/".
    $base_path = isset($base_arr["path"]) ? $base_arr["path"] : "";
    if ($base_path == "") { $base_path = "/"; }
    $base_path = preg_replace("/(.*\/).*/","\\1",$base_path);

    # Root-relative link: replaces the whole path of the base.
    $url_path = (is_array($url_arr) && isset($url_arr["path"])) ? $url_arr["path"] : "";
    if (substr($url_path,0,1) == "/") {
        return $base_base.$url;
    }

    # "./x" is the same as "x"; keep going so "./../x" resolves as well.
    if (preg_match("'^\./'",$url)) {
        $url = preg_replace("'^\./'","",$url);
    }
    # Every leading "../" removes one directory from the base path
    # (the path never climbs above "/").
    while (preg_match("'^\.\./'",$url)) {
        $url = preg_replace("'^\.\./'","",$url);
        $base_path = preg_replace("/(.*\/).*\//","\\1",$base_path);
    }
    return $base_base.$base_path.$url;
}
#=====================================================================
# Decide whether a URL may be crawled.
#
# Returns TRUE only when the URL starts with http://, matches $file_ext,
# matches neither $no_index_files nor $no_index_dir, and matches at least
# one pattern in $allow_url. All configured patterns are applied
# case-insensitively. Returns FALSE otherwise.
function check_url($url) {
    global $file_ext, $no_index_files, $no_index_dir, $allow_url;

    # The four filters are tried in order; the first hit rejects the URL.
    $rejected = ! preg_match("'^http://'",$url)
        || ! preg_match ("'$file_ext'i", $url)
        || preg_match ("'$no_index_files'i", $url)
        || preg_match ("'$no_index_dir'i", $url);
    if ($rejected) {
        return FALSE;
    }

    # Accept as soon as one allowed-site pattern matches.
    foreach ($allow_url as $pattern) {
        if ( preg_match("'$pattern'i", $url)) {
            return TRUE;
        }
    }
    return FALSE;
}
#=====================================================================
# Read the META keywords and description of an HTML page.
#
# Returns array(keywords, description). Either element is "" when the page
# has no matching tag. Tag and attribute names match case-insensitively;
# a tag is recognised only when NAME comes directly before CONTENT and
# CONTENT is the last attribute.
function get_META_info($html) {
    $res = array("", "");
    # Index 0 receives the keywords, index 1 the description.
    $names = array(0 => "keywords", 1 => "description");
    foreach ($names as $index => $name) {
        $pattern = "/<meta\s*name=\"?".$name."\"?\s*content=\"?([^\"]*)\"?>/i";
        if (preg_match($pattern,$html,$matches)) {
            $res[$index] = $matches[1];
        }
    }
    return $res;
}
#=====================================================================
?>