Welcome! Log In Create A New Profile

Advanced

HOW TO: Add a "does not include" filter for page content

Posted by chrisbloom7 
HOW TO: Add a "does not include" filter for page content
April 13, 2010 07:31PM
Here's a set of unified diffs that adds a field to the indexing form where you can specify a list of words or phrases that, when found in a page during indexing, will cause Sphider to ignore the current page and remove it from the catalog if it is already indexed. This works almost exactly like the existing URL Must Include/Ignore functionality (http://www.sphider.eu/docs.php#mustinc), except that it works on the HTML of the page that is currently being scanned, and it accepts phrases in addition to single words.

For example, I needed to exclude (and not just exclude, but remove from the existing catalog) any page that included the text "<h2>Error: Product Not Found</h2>" or "This category has no products". I can now enter those two terms, one per line, into the new text area on the Site Edit form (also on the Index form, in both cases under Advanced Options), and when I run the reindex those pages will be ignored and/or removed from the catalog. Regular expressions should work here using the same format as specified at the docs link above, though I haven't tried it myself.

Index: admin/admin.php
===================================================================
--- admin.php	(revision 13242)
+++ admin.php	(revision 13243)
@@ -271,7 +271,7 @@
 
 	function editsiteform($site_id) {
 		global $mysql_table_prefix;
-		$result = mysql_query("SELECT site_id, url, title, short_desc, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where site_id=$site_id");
+		$result = mysql_query("SELECT site_id, url, title, short_desc, spider_depth, required, disallowed, mustnot_include, can_leave_domain from ".$mysql_table_prefix."sites where site_id=$site_id");
 		echo mysql_error();
 		$row = mysql_fetch_array($result);
 		$depth = $row['spider_depth'];
@@ -296,7 +296,7 @@
 			<form action=admin.php method=post>
 			<input type=hidden name=f value=4>
 			<input type=hidden name=site_id value=<?php print $site_id;?>>
-			<tr><td><b>URL:</b></td><td align ="right"></td><td><input type=text name=url value=<?php print "\"".$row['url']."\""?> size=60></td></tr>
+			<tr><td width="175"><b>URL:</b></td><td align ="right"></td><td><input type=text name=url value=<?php print "\"".$row['url']."\""?> size=48></td></tr>
 			<tr><td><b>Title:</b></td><td></td><td> <input type=text name=title value=<?php print  "\"".stripslashes($row['title'])."\""?> size=60></td></tr>
 			<tr><td><b>Short description:</b></td><td></td><td><textarea name=short_desc cols=45 rows=3 wrap><?php print stripslashes($row['short_desc'])?></textarea></td></tr>
 			<tr><td><b>Spidering options:</b></td><td></td><td><input type="radio" name="soption" value="full" <?php print $fullchecked;?>> Full<br/>
@@ -305,6 +305,9 @@
 			</td></tr>			
 			<tr><td><b>URLs must include:</b></td><td></td><td><textarea name=in cols=45 rows=2 wrap="virtual"><?php print $row['required'];?></textarea></td></tr>
 			<tr><td><b>URLs must not include:</b></td><td></td><td><textarea name=out cols=45 rows=2 wrap="virtual"><?php print $row['disallowed'];?></textarea></td></tr>
+			<tr><td><b>Content must not include:</b><br />
+        <span style="font-size: 75%">Use raw HTML. Case insensitive. One word or phrase per line. See the <a href="http://www.sphider.eu/docs.php#mustinc">documentation</a> for information on using regular expressions.</span>
+        </td><td></td><td><textarea name=mustnot_include cols=45 rows=2 wrap="virtual"><?php print $row['mustnot_include'];?></textarea></td></tr>
 			
 			<tr><td>Category:</td><td></td><td>
 			<?php  walk_through_cats(0, 0, $site_id);?></td></tr>
@@ -313,7 +316,7 @@
 		}
 
 
-		function editsite ($site_id, $url, $title, $short_desc, $depth, $required, $disallowed, $domaincb,  $cat) {
+		function editsite ($site_id, $url, $title, $short_desc, $depth, $required, $disallowed, $mustnot_include, $domaincb,  $cat) {
 			global $mysql_table_prefix;
 			$short_desc = addslashes($short_desc);
 			$title = addslashes($title);
@@ -322,7 +325,7 @@
 			$compurl=parse_url($url);
 			if ($compurl['path']=='')
 				$url=$url."/";
-			mysql_query("UPDATE ".$mysql_table_prefix."sites SET url='$url', title='$title', short_desc='$short_desc', spider_depth =$depth, required='$required', disallowed='$disallowed', can_leave_domain=$domaincb WHERE site_id=$site_id");
+			mysql_query("UPDATE ".$mysql_table_prefix."sites SET url='$url', title='$title', short_desc='$short_desc', spider_depth =$depth, required='$required', disallowed='$disallowed', mustnot_include='$mustnot_include', can_leave_domain=$domaincb WHERE site_id=$site_id");
 			echo mysql_error();
 			$result=mysql_query("select category_id from ".$mysql_table_prefix."categories");
 			echo mysql_error();
@@ -672,7 +675,7 @@
 			$advurl = "";
 		} else {
 			$advurl = $url;
-			$result = mysql_query("select spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites " .
+			$result = mysql_query("select spider_depth, required, disallowed, mustnot_include, can_leave_domain from ".$mysql_table_prefix."sites " .
 					"where url='$url'");
 			echo mysql_error();
 			if (mysql_num_rows($result) > 0) {
@@ -685,7 +688,8 @@
 				}
 				$must = $row[1];
 				$mustnot = $row[2];
-				$canleave = $row[3];
+        $mustnot_include = $row[3];
+				$canleave = $row[4];
 			}			
 		}
 
@@ -694,7 +698,7 @@
 			<ul>
 				<li>
 				<?php 
-				if ($must !="" || $mustnot !="" || $canleave == 1 ) {	
+				if ($must !="" || $mustnot !="" || $canleave == 1 || $mustnot_include !="" ) {	
 					$_SESSION['index_advanced']=1;
 				}
 				if ($_SESSION['index_advanced']==1){
@@ -710,7 +714,7 @@
 		<br/>
 		<div id="indexoptions"><table>
 		<form action="spider.php" method="post">
-		<tr><td><b>Address:</b></td><td> <input type="text" name="url" size="48" value=<?php print "\"$url\"";?>></td></tr>
+		<tr><td width="175"><b>Address:</b></td><td> <input type="text" name="url" size="48" value=<?php print "\"$url\"";?>></td></tr>
 		<tr><td><b>Indexing options:</b></td><td>
 		<input type="radio" name="soption" value="full" <?php print $fullchecked;?>> Full<br/>
 		<input type="radio" name="soption" value="level" <?php print $levelchecked;?>>To depth: <input type="text" name="maxlevel" size="2" value="<?php print $spider_depth;?>"><br/>
@@ -724,6 +728,9 @@
 			<tr><td></td><td><input type="checkbox" name="domaincb" value="1" <?php print $checkcan;?>> Spider can leave domain <!--a href="javascript:;" onClick="window.open('hmm','newWindow','width=300,height=300,left=600,top=200,resizable');" >?</a--><br/></td></tr>
 			<tr><td><b>URL must include:</b></td><td><textarea name=in cols=35 rows=2 wrap="virtual"><?php print $must;?></textarea></td></tr>
 			<tr><td><b>URL must not include:</b></td><td><textarea name=out cols=35 rows=2 wrap="virtual"><?php print $mustnot;?></textarea></td></tr>
+			<tr><td><b>Content must not include:</b><br />
+        <span style="font-size: 75%">Use raw HTML. Case insensitive. One word or phrase per line. See the <a href="http://www.sphider.eu/docs.php#mustinc">documentation</a> for information on using regular expressions.</span>
+        </td><td><textarea name=mustnot_include cols=35 rows=2 wrap="virtual"><?php print $mustnot_include;?></textarea></td></tr>
 			<?php 
 		}
 		?>
@@ -1176,7 +1183,7 @@
 			if ($soption =='full') {
 				$depth = -1;
 			} 
-			$message = editsite ($site_id, $url, $title, $short_desc, $depth, $in, $out,  $domaincb, $cat);
+			$message = editsite ($site_id, $url, $title, $short_desc, $depth, $in, $out, $mustnot_include, $domaincb, $cat);
 			showsites($message);
 		break;
 		case 5:


Index: admin/messages.php
===================================================================
--- messages.php	(revision 13242)
+++ messages.php	(revision 13243)
@@ -52,6 +52,10 @@
 "minWords" => Array (
 	0 => " <font color=\"red\">Page contains less than $min_words_per_page words</font><br>\n",
 	1 => " Page contains less than $min_words_per_page words.\n"
+    ),
+    "contentViolation" => Array (
+      0 => " <font color=\"red\">The page content does not pass the content must include/must not include filters.</font><br>\n",
+      1 => " The page content does not pass the content must include/must not include filters.\n"
  )
 );
 

Index: admin/spider.php
===================================================================
--- spider.php	(revision 13242)
+++ spider.php	(revision 13243)
@@ -103,14 +103,15 @@
 	} else {
 
 		if ($reindex == 1 && $command_line == 1) {
-			$result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where url='$url'");
+			$result=mysql_query("select url, spider_depth, required, disallowed, mustnot_include, can_leave_domain from ".$mysql_table_prefix."sites where url='$url'");
 			echo mysql_error();
 			if($row=mysql_fetch_row($result)) {
 				$url = $row[0];
 				$maxlevel = $row[1];
 				$in= $row[2];
 				$out = $row[3];
-				$domaincb = $row[4];
+        $mustnot_include = $row[4];
+				$domaincb = $row[5];
 				if ($domaincb=='') {
 					$domaincb=0;
 				}
@@ -128,8 +129,11 @@
 		if (!isset($out)) {
 			$out = "";
 		}
+		if (!isset($mustnot_include)) {
+			$mustnot_include = "";
+		}
 
-		index_site($url, $reindex, $maxlevel, $soption, $in, $out, $domaincb);
+		index_site($url, $reindex, $maxlevel, $soption, $in, $out, $mustnot_include, $domaincb);
 
 	}
 
@@ -142,7 +146,7 @@
 	}
 
 	
-	function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) {
+	function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $mustnot_include, $can_leave_domain, $reindex) {
 		global $entities, $min_delay;
 		global $command_line;
 		global $min_words_per_page;
@@ -217,19 +221,24 @@
 			}
 			
 			printStandardReport('starting', $command_line);
-		
+      
+			if (!check_include($file, "", $mustnot_include)) {
+      	$deletable = 1;
+      	$OKtoIndex = 0;
+				printStandardReport('contentViolation',$command_line);
+			}
+      else {
+        $newmd5sum = md5($file);
 
-			$newmd5sum = md5($file);
-			
+        if ($md5sum == $newmd5sum) {
+          printStandardReport('md5notChanged',$command_line);
+          $OKtoIndex = 0;
+        } else if (isDuplicateMD5($newmd5sum)) {
+          $OKtoIndex = 0;
+          printStandardReport('duplicate',$command_line);
+        }
+      }
 
-			if ($md5sum == $newmd5sum) {
-				printStandardReport('md5notChanged',$command_line);
-				$OKtoIndex = 0;
-			} else if (isDuplicateMD5($newmd5sum)) {
-				$OKtoIndex = 0;
-				printStandardReport('duplicate',$command_line);
-			}
-
 			if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) {
 				$urlparts = parse_url($url);
 				$newdomain = $urlparts['host'];
@@ -356,7 +365,7 @@
 	}
 
 
-	function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) {
+	function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $mustnot_include, $can_leave_domain) {
 		global $mysql_table_prefix, $command_line, $mainurl,  $tmp_urls, $domain_arr, $all_keywords;
 		if (!isset($all_keywords)) {
 		$result = mysql_query("select keyword_ID, keyword from ".$mysql_table_prefix."keywords");
@@ -411,19 +420,19 @@
 			}
 			
 			$qry = "update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
-					"disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id";
+					"disallowed = '$url_not_inc', mustnot_include = '$mustnot_include', can_leave_domain=$can_leave_domain where site_id=$site_id";
 			mysql_query ($qry);
 			echo mysql_error();
 		} else if ($site_id == '') {
-			mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " .
-					"values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', $can_leave_domain)");
+			mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, mustnot_include, can_leave_domain) " .
+					"values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', '$mustnot_include', $can_leave_domain)");
 			echo mysql_error();
 			$result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'");
 			$row = mysql_fetch_row($result);
 			$site_id = $row[0];
 		} else {
 			mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
-					"disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id");
+					"disallowed = '$url_not_inc', mustnot_include = '$mustnot_include', can_leave_domain=$can_leave_domain where site_id=$site_id");
 			echo mysql_error();
 		}
 	
@@ -529,7 +538,7 @@
 					echo mysql_error();
 					$rows = mysql_num_rows($result);
 					if ($rows == 0) {
-						index_url($thislink, $level+1, $site_id, '',  $domain, '', $sessid, $can_leave_domain, $reindex);
+						index_url($thislink, $level+1, $site_id, '',  $domain, '', $sessid, $mustnot_include, $can_leave_domain, $reindex);
 
 						mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
 						echo mysql_error();
@@ -537,7 +546,7 @@
 						$row = mysql_fetch_array($result);
 						$md5sum = $row['md5sum'];
 						$indexdate = $row['indexdate'];
-						index_url($thislink, $level+1, $site_id, $md5sum,  $domain, $indexdate, $sessid, $can_leave_domain, $reindex);
+						index_url($thislink, $level+1, $site_id, $md5sum,  $domain, $indexdate, $sessid, $mustnot_include, $can_leave_domain, $reindex);
 						mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
 						echo mysql_error();
 					}else {
@@ -561,23 +570,24 @@
 
 	function index_all() {
 		global $mysql_table_prefix;
-		$result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites");
+		$result=mysql_query("select url, spider_depth, required, disallowed, mustnot_include, can_leave_domain from ".$mysql_table_prefix."sites");
 		echo mysql_error();
-    	while ($row=mysql_fetch_row($result)) {
-    		$url = $row[0];
-	   		$depth = $row[1];
-    		$include = $row[2];
-    		$not_include = $row[3];
-    		$can_leave_domain = $row[4];
-    		if ($can_leave_domain=='') {
-    			$can_leave_domain=0;
-    		}
-    		if ($depth == -1) {
-    			$soption = 'full';
-    		} else {
-    			$soption = 'level';
-    		}
-			index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain);
+    while ($row=mysql_fetch_row($result)) {
+      $url = $row[0];
+      $depth = $row[1];
+      $include = $row[2];
+      $not_include = $row[3];
+      $mustnot_include = $row[4];
+      $can_leave_domain = $row[5];
+      if ($can_leave_domain=='') {
+        $can_leave_domain=0;
+      }
+      if ($depth == -1) {
+        $soption = 'full';
+      } else {
+        $soption = 'level';
+      }
+			index_site($url, 1, $depth, $soption, $include, $not_include, $mustnot_include, $can_leave_domain);
 		}
 	}

Note that you could easily add a "content must include" filter as well following my lead.

Oh, and this assumes that your `sites` table includes a field named `mustnot_include` which is of type TEXT. I already had one in my table from a prior upgrade, though it appeared to be unused.

One last thing, the diffs are based on the latest release of Sphider, 1.3.5.
Sorry, only registered users may post in this forum.

Click here to login