|
HOW TO: Add a "does not include" filter for page content April 13, 2010 10:31PM |
Registered: 3 years ago Posts: 3 |
Index: admin/admin.php
===================================================================
--- admin.php (revision 13242)
+++ admin.php (revision 13243)
@@ -271,7 +271,7 @@
function editsiteform($site_id) {
global $mysql_table_prefix;
- $result = mysql_query("SELECT site_id, url, title, short_desc, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where site_id=$site_id");
+ $result = mysql_query("SELECT site_id, url, title, short_desc, spider_depth, required, disallowed, mustnot_include, can_leave_domain from ".$mysql_table_prefix."sites where site_id=$site_id");
echo mysql_error();
$row = mysql_fetch_array($result);
$depth = $row['spider_depth'];
@@ -296,7 +296,7 @@
<form action=admin.php method=post>
<input type=hidden name=f value=4>
<input type=hidden name=site_id value=<?php print $site_id;?>>
- <tr><td><b>URL:</b></td><td align ="right"></td><td><input type=text name=url value=<?php print "\"".$row['url']."\""?> size=60></td></tr>
+ <tr><td width="175"><b>URL:</b></td><td align ="right"></td><td><input type=text name=url value=<?php print "\"".$row['url']."\""?> size=48></td></tr>
<tr><td><b>Title:</b></td><td></td><td> <input type=text name=title value=<?php print "\"".stripslashes($row['title'])."\""?> size=60></td></tr>
<tr><td><b>Short description:</b></td><td></td><td><textarea name=short_desc cols=45 rows=3 wrap><?php print stripslashes($row['short_desc'])?></textarea></td></tr>
<tr><td><b>Spidering options:</b></td><td></td><td><input type="radio" name="soption" value="full" <?php print $fullchecked;?>> Full<br/>
@@ -305,6 +305,9 @@
</td></tr>
<tr><td><b>URLs must include:</b></td><td></td><td><textarea name=in cols=45 rows=2 wrap="virtual"><?php print $row['required'];?></textarea></td></tr>
<tr><td><b>URLs must not include:</b></td><td></td><td><textarea name=out cols=45 rows=2 wrap="virtual"><?php print $row['disallowed'];?></textarea></td></tr>
+ <tr><td><b>Content must not include:</b><br />
+ <span style="font-size: 75%">Use raw HTML. Case insensitive. One word or phrase per line. See the <a href="http://www.sphider.eu/">Sphider documentation</a> for information on using regular expressions.</span>
+ </td><td></td><td><textarea name=mustnot_include cols=45 rows=2 wrap="virtual"><?php print $row['mustnot_include'];?></textarea></td></tr>
<tr><td>Category:</td><td></td><td>
<?php walk_through_cats(0, 0, $site_id);?></td></tr>
@@ -313,7 +316,7 @@
}
- function editsite ($site_id, $url, $title, $short_desc, $depth, $required, $disallowed, $domaincb, $cat) {
+ function editsite ($site_id, $url, $title, $short_desc, $depth, $required, $disallowed, $mustnot_include, $domaincb, $cat) {
global $mysql_table_prefix;
$short_desc = addslashes($short_desc);
$title = addslashes($title);
@@ -322,7 +325,7 @@
$compurl=parse_url($url);
if ($compurl['path']=='')
$url=$url."/";
- mysql_query("UPDATE ".$mysql_table_prefix."sites SET url='$url', title='$title', short_desc='$short_desc', spider_depth =$depth, required='$required', disallowed='$disallowed', can_leave_domain=$domaincb WHERE site_id=$site_id");
+ mysql_query("UPDATE ".$mysql_table_prefix."sites SET url='$url', title='$title', short_desc='$short_desc', spider_depth =$depth, required='$required', disallowed='$disallowed', mustnot_include='$mustnot_include', can_leave_domain=$domaincb WHERE site_id=$site_id");
echo mysql_error();
$result=mysql_query("select category_id from ".$mysql_table_prefix."categories");
echo mysql_error();
@@ -672,7 +675,7 @@
$advurl = "";
} else {
$advurl = $url;
- $result = mysql_query("select spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites " .
+ $result = mysql_query("select spider_depth, required, disallowed, mustnot_include, can_leave_domain from ".$mysql_table_prefix."sites " .
"where url='$url'");
echo mysql_error();
if (mysql_num_rows($result) > 0) {
@@ -685,7 +688,8 @@
}
$must = $row[1];
$mustnot = $row[2];
- $canleave = $row[3];
+ $mustnot_include = $row[3];
+ $canleave = $row[4];
}
}
@@ -694,7 +698,7 @@
<ul>
<li>
<?php
- if ($must !="" || $mustnot !="" || $canleave == 1 ) {
+ if ($must !="" || $mustnot !="" || $canleave == 1 || $mustnot_include !="" ) {
$_SESSION['index_advanced']=1;
}
if ($_SESSION['index_advanced']==1){
@@ -710,7 +714,7 @@
<br/>
<div id="indexoptions"><table>
<form action="spider.php" method="post">
- <tr><td><b>Address:</b></td><td> <input type="text" name="url" size="48" value=<?php print "\"$url\"";?>></td></tr>
+ <tr><td width="175"><b>Address:</b></td><td> <input type="text" name="url" size="48" value=<?php print "\"$url\"";?>></td></tr>
<tr><td><b>Indexing options:</b></td><td>
<input type="radio" name="soption" value="full" <?php print $fullchecked;?>> Full<br/>
<input type="radio" name="soption" value="level" <?php print $levelchecked;?>>To depth: <input type="text" name="maxlevel" size="2" value="<?php print $spider_depth;?>"><br/>
@@ -724,6 +728,9 @@
<tr><td></td><td><input type="checkbox" name="domaincb" value="1" <?php print $checkcan;?>> Spider can leave domain <!--a href="javascript:;" onClick="window.open('hmm','newWindow','width=300,height=300,left=600,top=200,resizable');" >?</a--><br/></td></tr>
<tr><td><b>URL must include:</b></td><td><textarea name=in cols=35 rows=2 wrap="virtual"><?php print $must;?></textarea></td></tr>
<tr><td><b>URL must not include:</b></td><td><textarea name=out cols=35 rows=2 wrap="virtual"><?php print $mustnot;?></textarea></td></tr>
+ <tr><td><b>Content must not include:</b><br />
+ <span style="font-size: 75%">Use raw HTML. Case insensitive. One word or phrase per line. See the <a href="http://www.sphider.eu/">Sphider documentation</a> for information on using regular expressions.</span>
+ </td><td><textarea name=mustnot_include cols=35 rows=2 wrap="virtual"><?php print $mustnot_include;?></textarea></td></tr>
<?php
}
?>
@@ -1176,7 +1183,7 @@
if ($soption =='full') {
$depth = -1;
}
- $message = editsite ($site_id, $url, $title, $short_desc, $depth, $in, $out, $domaincb, $cat);
+ $message = editsite ($site_id, $url, $title, $short_desc, $depth, $in, $out, $mustnot_include, $domaincb, $cat);
showsites($message);
break;
case 5:
Index: admin/messages.php
===================================================================
--- messages.php (revision 13242)
+++ messages.php (revision 13243)
@@ -52,6 +52,10 @@
"minWords" => Array (
0 => " <font color=\"red\">Page contains less than $min_words_per_page words</font><br>\n",
1 => " Page contains less than $min_words_per_page words.\n"
+ ),
+ "contentViolation" => Array (
+ 0 => " <font color=\"red\">The page content does not pass the content must include/must not include filters.</font><br>\n",
+ 1 => " The page content does not pass the content must include/must not include filters.\n"
)
);
Index: admin/spider.php
===================================================================
--- spider.php (revision 13242)
+++ spider.php (revision 13243)
@@ -103,14 +103,15 @@
} else {
if ($reindex == 1 && $command_line == 1) {
- $result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where url='$url'");
+ $result=mysql_query("select url, spider_depth, required, disallowed, mustnot_include, can_leave_domain from ".$mysql_table_prefix."sites where url='$url'");
echo mysql_error();
if($row=mysql_fetch_row($result)) {
$url = $row[0];
$maxlevel = $row[1];
$in= $row[2];
$out = $row[3];
- $domaincb = $row[4];
+ $mustnot_include = $row[4];
+ $domaincb = $row[5];
if ($domaincb=='') {
$domaincb=0;
}
@@ -128,8 +129,11 @@
if (!isset($out)) {
$out = "";
}
+ if (!isset($mustnot_include)) {
+ $mustnot_include = "";
+ }
- index_site($url, $reindex, $maxlevel, $soption, $in, $out, $domaincb);
+ index_site($url, $reindex, $maxlevel, $soption, $in, $out, $mustnot_include, $domaincb);
}
@@ -142,7 +146,7 @@
}
- function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) {
+ function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $mustnot_include, $can_leave_domain, $reindex) {
global $entities, $min_delay;
global $command_line;
global $min_words_per_page;
@@ -217,19 +221,24 @@
}
printStandardReport('starting', $command_line);
-
+
+ if (!check_include($file, "", $mustnot_include)) {
+ $deletable = 1;
+ $OKtoIndex = 0;
+ printStandardReport('contentViolation',$command_line);
+ }
+ else {
+ $newmd5sum = md5($file);
- $newmd5sum = md5($file);
-
+ if ($md5sum == $newmd5sum) {
+ printStandardReport('md5notChanged',$command_line);
+ $OKtoIndex = 0;
+ } else if (isDuplicateMD5($newmd5sum)) {
+ $OKtoIndex = 0;
+ printStandardReport('duplicate',$command_line);
+ }
+ }
- if ($md5sum == $newmd5sum) {
- printStandardReport('md5notChanged',$command_line);
- $OKtoIndex = 0;
- } else if (isDuplicateMD5($newmd5sum)) {
- $OKtoIndex = 0;
- printStandardReport('duplicate',$command_line);
- }
-
if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) {
$urlparts = parse_url($url);
$newdomain = $urlparts['host'];
@@ -356,7 +365,7 @@
}
- function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) {
+ function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $mustnot_include, $can_leave_domain) {
global $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords;
if (!isset($all_keywords)) {
$result = mysql_query("select keyword_ID, keyword from ".$mysql_table_prefix."keywords");
@@ -411,19 +420,19 @@
}
$qry = "update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
- "disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id";
+ "disallowed = '$url_not_inc', mustnot_include = '$mustnot_include', can_leave_domain=$can_leave_domain where site_id=$site_id";
mysql_query ($qry);
echo mysql_error();
} else if ($site_id == '') {
- mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " .
- "values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', $can_leave_domain)");
+ mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, mustnot_include, can_leave_domain) " .
+ "values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', '$mustnot_include', $can_leave_domain)");
echo mysql_error();
$result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'");
$row = mysql_fetch_row($result);
$site_id = $row[0];
} else {
mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
- "disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id");
+ "disallowed = '$url_not_inc', mustnot_include = '$mustnot_include', can_leave_domain=$can_leave_domain where site_id=$site_id");
echo mysql_error();
}
@@ -529,7 +538,7 @@
echo mysql_error();
$rows = mysql_num_rows($result);
if ($rows == 0) {
- index_url($thislink, $level+1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex);
+ index_url($thislink, $level+1, $site_id, '', $domain, '', $sessid, $mustnot_include, $can_leave_domain, $reindex);
mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
echo mysql_error();
@@ -537,7 +546,7 @@
$row = mysql_fetch_array($result);
$md5sum = $row['md5sum'];
$indexdate = $row['indexdate'];
- index_url($thislink, $level+1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex);
+ index_url($thislink, $level+1, $site_id, $md5sum, $domain, $indexdate, $sessid, $mustnot_include, $can_leave_domain, $reindex);
mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
echo mysql_error();
}else {
@@ -561,23 +570,24 @@
function index_all() {
global $mysql_table_prefix;
- $result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites");
+ $result=mysql_query("select url, spider_depth, required, disallowed, mustnot_include, can_leave_domain from ".$mysql_table_prefix."sites");
echo mysql_error();
- while ($row=mysql_fetch_row($result)) {
- $url = $row[0];
- $depth = $row[1];
- $include = $row[2];
- $not_include = $row[3];
- $can_leave_domain = $row[4];
- if ($can_leave_domain=='') {
- $can_leave_domain=0;
- }
- if ($depth == -1) {
- $soption = 'full';
- } else {
- $soption = 'level';
- }
- index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain);
+ while ($row=mysql_fetch_row($result)) {
+ $url = $row[0];
+ $depth = $row[1];
+ $include = $row[2];
+ $not_include = $row[3];
+ $mustnot_include = $row[4];
+ $can_leave_domain = $row[5];
+ if ($can_leave_domain=='') {
+ $can_leave_domain=0;
+ }
+ if ($depth == -1) {
+ $soption = 'full';
+ } else {
+ $soption = 'level';
+ }
+ index_site($url, 1, $depth, $soption, $include, $not_include, $mustnot_include, $can_leave_domain);
}
}