Showing posts with label Yahoo SERP Scraper. Show all posts
Showing posts with label Yahoo SERP Scraper. Show all posts

Multi SERP, Suggested Keywords Scraper

Tuesday, 6 January 2009

It uses curl_multi_exec to scrape live, ask, yahoo and google for suggested keywords based on a keyword you provide.

At the moment it just outputs the keywords as hyperlinks to fetch longer-tail keywords based on the actual suggestion. But, with just a minor tweak it can output as plain text, and with a couple of extra lines of code it could even save them to a file/db/milky way.

It's really simple stuff, but pretty slick thanks to curl_multi_exec.

<?php
// Multi-engine "suggested keywords" scraper.
// Renders a search form; when a keyword is submitted it queries Yahoo,
// Google, Ask and Live in parallel via curl_multi_exec and extracts each
// engine's "related searches" suggestions from the returned HTML.

// Form to fetch keywords
echo'<form method="get" action="">';
echo'<input type="text" name="keyword" />';
echo'<input type="submit" value="Query!" />';
echo'</form>';

// Prepare our keyword.
// Guard with isset() so a bare page load (no ?keyword=) does not raise
// an undefined-index notice.
$keyword = isset($_GET['keyword'])
 ? trim(str_replace(array('_','+'), ' ', strip_tags($_GET['keyword'])))
 : '';

// Build http header sent with every request (mimics a real browser)
$header[] = "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; 
$header[] = "Cache-Control: max-age=0"; 
$header[] = "Connection: keep-alive"; 
$header[] = "Keep-Alive: 300"; 
$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7"; 
$header[] = "Accept-Language: en-us,en;q=0.5"; 
$header[] = "Pragma: ";

// User agent
$ua = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.29 Safari/525.19';

if ($keyword):
  // A keyword has been submitted 
 
 // Our search urls — order matters: the switch() below keys off the index
 $nodes[] = 'http://search.yahoo.com/search?ei=UTF-8&fr=yfp-t-802&rs=more&rs=all&p='.urlencode($keyword);
 $nodes[] = 'http://www.google.com/search?hl=en&btnG=Google+Search&q='.urlencode($keyword);
 $nodes[] = 'http://uk.ask.com/web?search=search&dm=all&qsrc=0&o=312&l=dir&siteid=&q='.urlencode($keyword);
 $nodes[] = 'http://search.live.com/results.aspx?form=QBRE&q='.urlencode($keyword);
 $node_count = count($nodes);
 
 $curl_arr = array();
 $master = curl_multi_init();

 // Loop through urls, attaching one easy handle per engine to the multi handle
 for($i = 0; $i < $node_count; $i++):
  $url = $nodes[$i];
  $curl_arr[$i] = curl_init($url);
  curl_setopt($curl_arr[$i], CURLOPT_URL, $url); 
  curl_setopt($curl_arr[$i], CURLOPT_USERAGENT, $ua); 
  curl_setopt($curl_arr[$i], CURLOPT_HTTPHEADER, $header); 
  curl_setopt($curl_arr[$i], CURLOPT_ENCODING, 'gzip,deflate'); 
  curl_setopt($curl_arr[$i], CURLOPT_TIMEOUT, 10); 
  curl_setopt($curl_arr[$i], CURLOPT_RETURNTRANSFER, true);
  curl_setopt($curl_arr[$i], CURLOPT_FOLLOWLOCATION, true);
  curl_multi_add_handle($master, $curl_arr[$i]);
 endfor;

 // Drive all four transfers until every one has finished
 do{curl_multi_exec($master, $running);}
 while($running > 0);

 // Get our keywords.
 // NOTE: single '#' regex delimiters here — a doubled delimiter ("//...//")
 // makes the pattern empty and the rest invalid modifiers, a fatal error.
 // NOTE(review): these patterns are tied to each engine's 2009-era markup —
 // expect them to need updating when the SERP HTML changes.
 for($i = 0; $i < $node_count; $i++):
  $results = curl_multi_getcontent($curl_arr[$i]);
  switch($i):
   case 0: // yahoo.com
    preg_match_all("#=rs-top\">(.+?)<\/a>,<\/li>#", $results, $yah_keywords);
    break;
   case 1: // google.com
    preg_match_all("#<td style=\"padding:0 0 7px;padding-right:34px;vertical-align:top\"><a (.+?)&(.+?)\">(.+?)<\/a>#", $results, $goo_keywords);
    break;
   case 2: // ask.com
    preg_match_all("#<div class=\"zm\" ><a href=(.+?)ec:\'19\'\}\)\" >(.+?)<\/a><\/div>#", $results, $ask_keywords);
    break;
   case 3: // live.com
    preg_match_all("#<li><a href=(.+?);FORM=QSRE(.+?),this\)\">(.+?)<\/a> <\/li>#", $results, $msn_keywords);
    break;
  endswitch;
  // Release each easy handle once its content has been read
  curl_multi_remove_handle($master, $curl_arr[$i]);
  curl_close($curl_arr[$i]);
 endfor;
 curl_multi_close($master);

 // Join keywords and make sure the list is unique.
 // Capture-group indexes differ per engine because each pattern captures a
 // different number of groups; the keyword is always the last one.
 $wordz = array_unique(array_merge($yah_keywords[1], $goo_keywords[3], $ask_keywords[2], $msn_keywords[3]));

 $words = genKeywords($wordz); // HTML
 //$words = genKeywords($wordz,1); // TEXT

 // Good Place To Maybe Save Em?!
 // Make a folder in the same folder as this script, name it keywords, then uncomment below to save the keyword lists
// $file = str_replace(" ", "_", $keyword);
// if ($fp = fopen('./keywords/'.$file.'.txt', "w+")):
//  fwrite($fp, genKeywords($wordz,1));
//  fclose($fp);
//  echo'<h3>Saved!</h3>';
// endif;
 
 // output!
 echo $words;

endif;

function genKeywords($kw, $linked=0)
{
 // Helper function for outputting the scraped keyword list.
 //
 // Parameters:
 //   $kw     - array of raw keyword strings scraped from SERP HTML
 //   $linked - 0 (default) emits each keyword as a self-referencing
 //             <a href="?keyword=..."> link; 1 emits plain text, one
 //             keyword per CRLF-terminated line (suitable for saving)
 // Returns the assembled string.
 $res = '';
 $rem = array(' ','+','-',);
 foreach ($kw as $keyword):
  $keyword = str_replace($rem, ' ', strip_tags($keyword));
  if (0 == $linked):
   // Escape before embedding in HTML: strip_tags() leaves quotes and
   // ampersands intact, so scraped content could otherwise break out of
   // the href attribute (markup/attribute injection).
   $href = htmlspecialchars(str_replace($rem, '_', $keyword), ENT_QUOTES);
   $res.= '<a href="?keyword='.$href.'">'.htmlspecialchars($keyword, ENT_QUOTES)."</a><br />\n";
  else: $res.= trim(str_replace('_', ' ', $keyword))."\r\n";
  endif;
 endforeach;
 return $res;
}

?>
Keyword List Building Made Easy!

Scraping Yahoo Results

Thursday, 1 January 2009

Hoya!

Simple little script for ya here. This scraper basically pulls all the data it can for your given keyword.

Returns the number of results, suggested keywords, and the first 10 results (broken into title, blurb, and URL).

I suggest you run this in a development-only environment (Wampserver is a great suggestion for Windows users new to PHP and wanting to play around with scripts!)

The Code:
<?php
// Scrape a single Yahoo results page for one keyword: total result count,
// suggested ("related") keywords, and the first 10 organic results
// (title, blurb and URL).
// NOTE(review): the regexes below are tied to Yahoo's 2009-era markup and
// will need updating when the SERP HTML changes.

# config
$yah_search = 'http://search.yahoo.com/search?ei=UTF-8&fr=yfp-t-802&rs=more&rs=all&p=';
$ua = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11';

$keyword = 'poker';

# Load the page
$yah_url = $yah_search.urlencode($keyword);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $yah_url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, $ua);
curl_setopt($ch, CURLOPT_TIMEOUT, 10); # don't hang the script on a dead connection
$yahoo_data = curl_exec($ch);
curl_close($ch);

# Bail out explicitly instead of silently matching against boolean false
if (false === $yahoo_data):
 die('Request to Yahoo failed');
endif;

# Initialise accumulators so the ".=" appends below never touch an
# undefined variable (avoids E_NOTICE on every first append)
$total = '';
$keywords = '';
$res = '';

# Get the search results total
preg_match_all("#1 - 10 of (.+?) for #", $yahoo_data, $yah_total);
foreach($yah_total[1] as $key=>$val):
$total.= $val;
endforeach;

# Get the suggested keywords
preg_match_all("#=rs-top\">(.+?)<\/a>,<\/li>#", $yahoo_data, $yah_keywords);
foreach($yah_keywords[1] as $key=>$val):
$keywords.= strip_tags($val).'<br />';
endforeach;

# Get the 1st 10 results (title, blurb and url are matched separately and
# re-joined by index, so the three patterns must stay in lockstep)
preg_match_all("#<h3><a class=\"yschttl\" (.+?) >(.+?)<\/h3>#", $yahoo_data, $title);
preg_match_all("#<div class=\"abstr\">(.+?)<\/div>#", $yahoo_data, $rs);
preg_match_all("#<span class=url>(.+?)<\/span>#", $yahoo_data, $urls);
foreach($title[2] as $key => $none):
$res.= '<b>'.strip_tags($title[2][$key]).'</b><br />'
.wordwrap(strip_tags($rs[1][$key])).'<br />'
.strip_tags($urls[1][$key])."<br /><br />\n";
endforeach;

# Output it all!
echo'<pre><scraped>';
echo $total.' Results<br /><br />';
echo'<b>Suggested Keywords</b>:<br />'. $keywords.'<br />';
echo $res;
echo'</scraped></pre>';

Really simple!

Maybe too simple? What if you want more than 10 results? To show 100 results, simply look for the first line:

$yah_search = 'http://search.yahoo.com/search?ei=UTF-8&fr=yfp-t-802&rs=more&rs=all&p=';

And change it to:

$yah_search = 'http://search.yahoo.com/search?n=100&ei=UTF-8&va_vt=any&vo_vt=any&ve_vt=any&vp_vt=any&vd=all&vst=0&vf=all&vm=p&fl=0&fr=sfp&p=';

Maybe you want the urls to become hyperlinks?

Piece of piss! Just look for this:

foreach($title[2] as $key => $none):
$res.= '<b>'.strip_tags($title[2][$key]).'</b><br />'
.wordwrap(strip_tags($rs[1][$key])).'<br />'
.strip_tags($urls[1][$key])."<br /><br />\n";
endforeach;

And change it to:

foreach($title[2] as $key => $none):
$res.= '<b>'.strip_tags($title[2][$key]).'</b><br />'
    .wordwrap(strip_tags($rs[1][$key])).'<br />
    <a href="http://'.strip_tags($urls[1][$key])."\">".strip_tags($urls[1][$key])."</a><br /><br />\n";
endforeach;

There is loads you can do!

Hows about saving the results in a database and randomly choosing 10 blurbs for filling a space? Might add a solution if anyone asks!

For now, Enjoy, and pop back to this page for further ideas and implementations for this scraper!