<?php
// Remove PHP's execution time limit (0 = unlimited); the script below fetches
// many remote pages one after another.
set_time_limit(0);
/**
 * Read the entire contents of a URL or file path into a string.
 *
 * @param string $url Anything fopen() can open for reading (local path or stream wrapper URL).
 * @return string|false The contents, or false when the stream cannot be opened
 *                      (stream_get_contents() may itself also yield false on a read failure).
 */
function readContents($url) {
    $stream = fopen($url, 'r');
    if (!$stream) {
        return false;
    }
    // Read first, then close explicitly: the previous version returned before
    // reaching fclose(), so the handle was never closed by this function.
    $contents = stream_get_contents($stream);
    fclose($stream);
    return $contents;
}
/**
 * Replace every occurrence of each string in $stripArray with $replace.
 *
 * The needles are applied one after another in array order, so a later needle
 * also matches text that only appears once earlier needles have been replaced.
 *
 * @param string $sourceString Text to clean up.
 * @param array  $stripArray   Substrings to replace (keys are ignored).
 * @param string $replace      Replacement text; defaults to "" (i.e. remove).
 * @return string The cleaned text.
 */
function stripString($sourceString, $stripArray, $replace = "") {
    // str_replace() natively accepts an array of search strings and processes
    // them sequentially, which is exactly what the manual loop used to do.
    return str_replace($stripArray, $replace, $sourceString);
}
/**
 * Drop every entry that cannot be a link fragment.
 *
 * An entry is discarded when it is empty, a single space, null, or does not
 * contain the text "href". Surviving entries keep their original keys and order.
 *
 * @param array $sourceArray Candidate fragments.
 * @return array The entries containing "href", with keys preserved.
 */
function unsetUseless($sourceArray) {
    $kept = array();
    foreach ($sourceArray as $index => $fragment) {
        // The cheap emptiness checks run first so substr_count() is never handed null.
        if ($fragment == "" || $fragment == " " || is_null($fragment) || substr_count($fragment, "href") == 0) {
            continue;
        }
        $kept[$index] = $fragment;
    }
    return $kept;
}
/**
 * Return the text between the first occurrence of $start and the next
 * occurrence of $end that follows it.
 *
 * @param string $string Text to search.
 * @param string $start  Opening marker (not included in the result).
 * @param string $end    Closing marker (not included in the result).
 * @return string The enclosed text, or "" when either marker is empty or cannot be found.
 */
function getStringBetween($string, $start, $end){
    $string = (string) $string;
    $start = (string) $start;
    $end = (string) $end;
    if ($start === "" || $end === "") {
        return "";
    }

    $ini = strpos($string, $start);
    if ($ini === false) {
        return "";
    }
    $ini += strlen($start);

    // Without this check a missing end marker made strpos() return false, which
    // turned into a negative length and produced an arbitrary truncated slice.
    $stop = strpos($string, $end, $ini);
    if ($stop === false) {
        return "";
    }
    return substr($string, $ini, $stop - $ini);
}
/*----------START SCRIPT----------*/
//Open the connection to the MySQL database.
// Uses mysqli rather than the old mysql_* API, which was removed in PHP 7.0.
// Failures are reported through return values so that one bad row cannot abort
// the whole scraping run (newer PHP versions throw exceptions by default).
mysqli_report(MYSQLI_REPORT_OFF);
$conn = mysqli_connect("localhost", "root", "", "laughpolice");
if (!$conn) {
    // Scraping every page is pointless when nothing can be stored.
    die("Could not connect to MySQL: " . mysqli_connect_error());
}

// Prepare the insert once; the three bound variables are filled in for every joke.
$insertJoke = mysqli_prepare($conn, "INSERT INTO jokedata (joketitle, jokecategory, joke, datesubmitted) VALUES (?, ?, ?, NOW())");
if (!$insertJoke) {
    die("Could not prepare joke insert: " . mysqli_error($conn));
}
$jokeTitle = "";
$jokeCategory = "ALL4HUMOR"; // Every scraped joke is filed under this single category.
$actualJoke = "";
mysqli_stmt_bind_param($insertJoke, "sss", $jokeTitle, $jokeCategory, $actualJoke);

//First define the category page array: index_32.html through index_52.html.
$categoryPageArray = array();
foreach (range(32, 52) as $categoryPageIndex) {
    $categoryPageArray[] = "http://www.all4humor.com/jokes/index_" . $categoryPageIndex . ".html";
}

//Unused description list, kept for reference only:
//$categoryDescArray = array("DUMB", "BLONDE", "DIRTY", "GROSS", "CLEAN", "KNOCKKNOCK", "YOMAMMA", "MEXICAN",
// "KIDS", "WEDDING", "IRISH", "GOLF", "REALLYFUNNY", "REDNECK", "BAR");

// Markers and strip lists never change, so they are built once outside the loops.

// Delimiters of the joke list on a category page.
$categoryListStart = "<div id=\"videodiv\" class=\"centercontentdiv\">";
$categoryListEnd = "</div></div></div>";
// Markup removed from the joke list so that only the "<a href=...>" fragments remain.
$uselessData = array("\n", "<div class=\"contentlistdiv\">", "<div class=\"contentlisttitle\" style=\"width:455px;float:left;\">",
    " class=\"contentlisttitle\"", "</a>", "</div>");

// Delimiters of the joke text on a joke page.
$jokeBodyStart = " class=\"subcatlist\" style=\"padding:5px\">";
$jokeBodyEnd = "</td></tr>\n <tr><td><img src=\"/img/content_btmbox_r3_c1.gif\"></td></tr>\n <tr><td><img src=\"/img/spacer.gif\" height=\"10\"></td></tr>\n <tr><td height=\"10\" style=\"background-image: url(/img/content_btmbox_r1_c1.gif)\"></td></tr>\n <tr><td style=\"background-image: url(/img/content_btmbox_r2_c1.gif)\" class=\"subcatlist\">\n <div style=\"width:220px;height:22px;float:left;\" align=\"center\"";
// Markup removed from the joke text. The order matters because the entries are applied sequentially.
// NOTE(review): "\<" is not a PHP escape sequence, so the entry starting with "\<td" contains a
// literal backslash and probably never matches; it is kept as-is to preserve behavior.
$jokeMarkup = array("</font>", "</p>", "<font"," face=","\"Arial\""," size=","\"6\">",
    "bgcolor=\"#FFFFFF\">", "\<td align=\"center\" ", "<tr>",
    "<table border=\"2\"width=\"100%\">", "<td>", "<tr>",
    "<p align=\"left\">", "<td align=\"center\"", "\"2\"><BR><STRONG>",
    ":</STRONG>", "</tr>", "</td>", "\"2\">", "<P>", "\n", "arial2>", "<p>", "</p>");

//Main loop - visit each category page, obtain the list of jokes, visit each joke page,
//and save the joke.
foreach ($categoryPageArray as $categoryPageValue) {
    //Read page contents into a string
    $currentCategoryPageSource = readContents($categoryPageValue);
    if ($currentCategoryPageSource === false) {
        error_log("Skipping category page (fetch failed): " . $categoryPageValue);
        continue;
    }

    //Get the part of the page that we want and strip the useless markup from it.
    $currentCategoryPageSource = getStringBetween($currentCategoryPageSource, $categoryListStart, $categoryListEnd);
    $currentCategoryPageSource = stripString($currentCategoryPageSource, $uselessData);

    //Explode the string into fragments starting after "<a " and keep only those containing a href.
    $currentCategoryLinkArray = unsetUseless(explode("<a ", $currentCategoryPageSource));

    //Visit each joke page, obtain the joke, and store it.
    foreach ($currentCategoryLinkArray as $linkArrayValue) {
        //The link target is whatever sits between =" and "> in the fragment.
        $jokeUrl = (string) getStringBetween($linkArrayValue, "=\"", "\">");
        if ($jokeUrl === "") {
            continue;
        }

        //Read joke page contents into a string
        $jokePageSource = readContents($jokeUrl);
        if ($jokePageSource === false) {
            error_log("Skipping joke page (fetch failed): " . $jokeUrl);
            continue;
        }

        //Get the title
        $jokeTitle = (string) getStringBetween($jokePageSource, "<title>", "</title>");

        //Get the part of the page we want, strip the markup, and trim the result.
        $actualJoke = trim(stripString(getStringBetween($jokePageSource, $jokeBodyStart, $jokeBodyEnd), $jokeMarkup));
        if ($actualJoke === "") {
            // Previously such pages were inserted as blank rows.
            error_log("Skipping joke page (no joke text found): " . $jokeUrl);
            continue;
        }

        //Store the joke; the bound parameters make manual escaping unnecessary.
        if (!mysqli_stmt_execute($insertJoke)) {
            error_log("Could not store joke from " . $jokeUrl . ": " . mysqli_stmt_error($insertJoke));
        }

        //Debug aid: echo data to browser
        // echo "<h1>$jokeCategory - $jokeTitle</h1><XMP>$actualJoke</XMP>";
    }
}

//Close the statement and the connection to the database.
mysqli_stmt_close($insertJoke);
mysqli_close($conn);
?>