#!/usr/bin/php
\n"; else $NL = "\n";
if ($show_html) $HR = "
\n"; else $HR = "_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n";
if ($show_html) $B = ""; else $B = "!";
if ($show_html) $B_ = ""; else $B_ = "!";
/*
* Start of main()
*/
if ($show_html)
{
echo "";
}
$keywords = explode(",", $test_keywords);
if (!count($keywords)) die ("Error: no keywords defined.$NL");
if (!rmkdir($working_dir)) die("Failed to create/open $working_dir$NL");
$country_data = get_google_cc($test_country, $test_language);
if (!$country_data) die("Invalid country/language code specified.$NL");
$ready = get_license();
if (!$ready) die("The specified API key account for user $uid is not active or invalid. $NL");
if ($PLAN['protocol'] != "http") die("Wrong proxy protocol configured, switch to HTTP. $NL");
echo "$NL$B Search Engine Scraper for $test_website_url initated $B_ $NL$NL";
/*
* This loop iterates through all keyword combinations
*/
$ch = NULL;
$rotate_ip = 0; // variable that triggers an IP rotation (normally only during keyword changes)
$max_errors_total = 3; // abort script if there are 3 keywords that can not be scraped (something is going wrong and needs to be checked)
$rank_data = array();
$siterank_data = array();
$break=0; // variable used to cancel loop without losing ranking data
foreach ($keywords as $keyword)
{
$rank = 0;
$max_errors_page = 5; // abort script if there are 5 errors in a row, that should not happen
if ($test_max_pages <= 0) break;
$search_string = urlencode($keyword);
$rotate_ip = 1; // IP rotation for each new keyword
/*
* This loop iterates through all result pages for the given keyword
*/
for ($page = 0; $page < $test_max_pages; $page++)
{
$serp_data = load_cache($search_string, $page, $country_data, $force_cache); // load results from local cache if available for today
$maxpages = 0;
if (!$serp_data)
{
$ip_ready = check_ip_usage(); // test if ip has not been used within the critical time
while (!$ip_ready || $rotate_ip)
{
$ok = rotate_proxy(); // start/rotate to the IP that has not been started for the longest time, also tests if proxy connection is working
if ($ok != 1)
{
die ("Fatal error: proxy rotation failed:$NL $ok$NL");
}
$ip_ready = check_ip_usage(); // test if ip has not been used within the critical time
if (!$ip_ready)
{
die("ERROR: No fresh IPs left, try again later. $NL");
} else
{
$rotate_ip = 0; // ip rotated
break; // continue
}
}
delay_time(); // stop scraping based on the license size to spread scrapes best possible and avoid detection
global $scrape_result; // contains metainformation from the scrape_serp_google() function
$raw_data = scrape_google($search_string, $page, $country_data); // scrape html from search engine
if ($scrape_result != "SCRAPE_SUCCESS")
{
if ($max_errors_page--)
{
echo "There was an error scraping (Code: $scrape_result), trying again .. $NL";
$page--;
continue;
} else
{
$page--;
if ($max_errors_total--)
{
echo "Too many errors scraping keyword $search_string (at page $page). Skipping remaining pages of keyword $search_string .. $NL";
break;
} else
{
die ("ERROR: Max keyword errors reached, something is going wrong. $NL");
}
break;
}
}
mark_ip_usage(); // store IP usage, this is very important to avoid detection and gray/blacklistings
global $process_result; // contains metainformation from the process_raw() function
$serp_data = process_raw_v2($raw_data, $page); // process the html and put results into $serp_data
if (($process_result == "PROCESS_SUCCESS_MORE") || ($process_result == "PROCESS_SUCCESS_LAST"))
{
$result_count = count($serp_data);
$serp_data['page'] = $page;
if ($process_result != "PROCESS_SUCCESS_LAST")
{
$serp_data['lastpage'] = 1;
} else
{
$serp_data['lastpage'] = 0;
}
$serp_data['keyword'] = $keyword;
$serp_data['cc'] = $country_data['cc'];
$serp_data['lc'] = $country_data['lc'];
$serp_data['result_count'] = $result_count;
store_cache($serp_data, $search_string, $page, $country_data); // store results into local cache
}
if ($process_result != "PROCESS_SUCCESS_MORE")
{
$break=1;
//break;
} // last page
if (!$load_all_ranks)
{
for ($n = 0; $n < $result_count; $n++)
if (strstr($results[$n]['url'], $test_website_url))
{
verbose("Located $test_website_url within search results.$NL");
$break=1;
//break;
}
}
} // scrape clause
$result_count = $serp_data['result_count'];
for ($ref = 0; $ref < $result_count; $ref++)
{
$rank++;
$rank_data[$keyword][$rank]['title'] = $serp_data[$ref]['title'];
$rank_data[$keyword][$rank]['url'] = $serp_data[$ref]['url'];
$rank_data[$keyword][$rank]['host'] = $serp_data[$ref]['host'];
$rank_data[$keyword][$rank]['desc'] = $serp_data[$ref]['desc'];
$rank_data[$keyword][$rank]['type'] = $serp_data[$ref]['type'];
//$rank_data[$keyword][$rank]['desc']=$serp_data['desc'']; // not really required
if (strstr($rank_data[$keyword][$rank]['url'], $test_website_url))
{
$info = array();
$info['rank'] = $rank;
$info['url'] = $rank_data[$keyword][$rank]['url'];
$siterank_data[$keyword][] = $info;
}
}
if ($break == 1) break;
} // page loop
} // keyword loop
if ($show_all_ranks)
{
foreach ($rank_data as $keyword => $ranks)
{
echo "$NL$NL$B" . "Ranking information for keyword \"$keyword\" $B_$NL";
echo "$B" . "Rank [Type] - Website - Title$B_$NL";
$pos = 0;
foreach ($ranks as $rank)
{
$pos++;
if (strstr($rank['url'], $test_website_url))
{
echo "$B$pos [$rank[type]] - $rank[url] - $rank[title] $B_$NL";
// echo $rank['desc']."\n";
} else
{
echo "$pos [$rank[type]] - $rank[url] - $rank[title] $NL";
// echo $rank['desc']."\n";
}
}
}
}
foreach ($keywords as $keyword)
{
if (!isset($siterank_data[$keyword]))
{
echo "$NL$B" . "The specified site was not found in the search results for keyword \"$keyword\". $B_$NL";
} else
{
$siteranks = $siterank_data[$keyword];
echo "$NL$NL$B" . "Ranking information for keyword \"$keyword\" and website \"$test_website_url\" [$test_country / $test_language] $B_$NL";
foreach ($siteranks as $siterank)
echo "Rank $siterank[rank] for URL $siterank[url]$NL";
}
}
//var_dump($siterank_data);
if ($show_html)
{
echo "";
}
?>