Multi SERP, Suggested Keywords Scraper

Tuesday, 6 January 2009

This script uses curl_multi_exec to scrape Live, Ask, Yahoo and Google in parallel for suggested keywords based on a keyword you provide.

At the moment it just outputs the keywords as hyperlinks, each of which fetches longer-tail keywords based on that suggestion. With a minor tweak it can output plain text instead, and with a couple of extra lines of code it could even save them to a file/db/milky way.

It's really simple stuff, but pretty slick thanks to curl_multi_exec.

<?php
// Form to fetch keywords
echo '<form method="get" action="">';
echo '<input type="text" name="keyword" />';
echo '<input type="submit" value="Query!" />';
echo '</form>';

// Prepare our keyword.
// On first load the form has not been submitted yet, and a request such as
// ?keyword[]=x delivers an array, so only a real string is accepted here;
// anything else is treated as "no keyword".
$keyword = (isset($_GET['keyword']) && is_string($_GET['keyword'])) ? $_GET['keyword'] : '';
// Underscores and plus signs stand in for spaces in the generated links.
$keyword = trim(str_replace(array('_', '+'), ' ', strip_tags($keyword)));

// Build http header (browser-like request headers sent with every search request)
$header = array();
$header[] = "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
$header[] = "Cache-Control: max-age=0";
$header[] = "Connection: keep-alive";
$header[] = "Keep-Alive: 300";
$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
$header[] = "Accept-Language: en-us,en;q=0.5";
$header[] = "Pragma: ";

// User agent
$ua = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.29 Safari/525.19';

if ('' !== $keyword):
 // A keyword has been submitted (compared against '' so that "0" is still searchable).
 $query = urlencode($keyword);

 // One entry per search engine: the results URL, the regex that picks the
 // suggested keywords out of the returned HTML, and the capture group that
 // holds the keyword text. "#" is used as the regex delimiter so the closing
 // tags in the patterns need no escaping.
 $engines = array(
  array( // yahoo.com
   'url'     => 'http://search.yahoo.com/search?ei=UTF-8&fr=yfp-t-802&rs=more&rs=all&p='.$query,
   'pattern' => '#=rs-top">(.+?)</a>,</li>#',
   'group'   => 1,
  ),
  array( // google.com
   'url'     => 'http://www.google.com/search?hl=en&btnG=Google+Search&q='.$query,
   'pattern' => '#<td style="padding:0 0 7px;padding-right:34px;vertical-align:top"><a (.+?)&(.+?)">(.+?)</a>#',
   'group'   => 3,
  ),
  array( // ask.com
   'url'     => 'http://uk.ask.com/web?search=search&dm=all&qsrc=0&o=312&l=dir&siteid=&q='.$query,
   'pattern' => '#<div class="zm" ><a href=(.+?)ec:\'19\'\}\)" >(.+?)</a></div>#',
   'group'   => 2,
  ),
  array( // live.com
   'url'     => 'http://search.live.com/results.aspx?form=QBRE&q='.$query,
   'pattern' => '#<li><a href=(.+?);FORM=QSRE(.+?),this\)">(.+?)</a> </li>#',
   'group'   => 3,
  ),
 );

 $curl_arr = array();
 $master = curl_multi_init();

 // Queue one request per engine
 foreach ($engines as $i => $engine):
  $curl_arr[$i] = curl_init($engine['url']);
  curl_setopt($curl_arr[$i], CURLOPT_USERAGENT, $ua);
  curl_setopt($curl_arr[$i], CURLOPT_HTTPHEADER, $header);
  curl_setopt($curl_arr[$i], CURLOPT_ENCODING, 'gzip,deflate');
  curl_setopt($curl_arr[$i], CURLOPT_TIMEOUT, 10);
  curl_setopt($curl_arr[$i], CURLOPT_RETURNTRANSFER, true);
  curl_setopt($curl_arr[$i], CURLOPT_FOLLOWLOCATION, true);
  curl_multi_add_handle($master, $curl_arr[$i]);
 endforeach;

 // Run all transfers in parallel. Between rounds, wait for socket activity
 // instead of spinning the CPU in a tight loop.
 $running = 0;
 do {
  $status = curl_multi_exec($master, $running);
  if ($running > 0 && CURLM_OK === $status):
   if (-1 === curl_multi_select($master, 1.0)):
    // select() could not wait on the sockets; back off briefly instead.
    usleep(100000);
   endif;
  endif;
 } while ($running > 0 && (CURLM_OK === $status || CURLM_CALL_MULTI_PERFORM === $status));

 // Get our keywords. A failed request or a page without suggestions simply
 // contributes nothing.
 $wordz = array();
 foreach ($engines as $i => $engine):
  $results = (string) curl_multi_getcontent($curl_arr[$i]);
  curl_multi_remove_handle($master, $curl_arr[$i]);
  if ('' === $results):
   continue;
  endif;
  if (preg_match_all($engine['pattern'], $results, $found) && isset($found[$engine['group']])):
   $wordz = array_merge($wordz, $found[$engine['group']]);
  endif;
 endforeach;
 curl_multi_close($master);

 // Make sure the list is unique
 $wordz = array_unique($wordz);

 $words = genKeywords($wordz); // HTML
 //$words = genKeywords($wordz,1); // TEXT

 // Good place to maybe save 'em: create a writable folder named "keywords"
 // next to this script, then uncomment the lines below.
 // The file name is reduced to safe characters so user input cannot alter the path.
 // $file = preg_replace('/[^A-Za-z0-9_-]+/', '_', $keyword);
 // if ($fp = fopen('./keywords/'.$file.'.txt', 'w')):
 //  fwrite($fp, genKeywords($wordz,1));
 //  fclose($fp);
 //  echo'<h3>Saved!</h3>';
 // endif;

 // output!
 echo $words;

endif;

/**
 * Render a list of scraped keyword suggestions as HTML links or plain text.
 *
 * Each keyword has its markup stripped and its HTML entities decoded, then
 * "+" and "-" are replaced by spaces and the result is trimmed. Entries that
 * are not scalar, or that end up empty, are skipped.
 *
 * @param array $kw     Raw keyword strings, possibly containing HTML.
 * @param int   $linked 0 (default): one link per keyword, pointing at
 *                      "?keyword=<keyword with spaces as underscores>" and
 *                      followed by "<br />\n". Any other value: plain text,
 *                      one keyword per line, each terminated by "\r\n".
 * @return string The rendered list; empty string when nothing is left to render.
 */
function genKeywords($kw, $linked=0)
{
 $asHtml = (0 == $linked);
 $rem = array(' ','+','-');
 // ENT_SUBSTITUTE keeps keywords with invalid byte sequences from being
 // blanked out; it is only used where the running PHP version defines it.
 $escFlags = ENT_QUOTES | (defined('ENT_SUBSTITUTE') ? ENT_SUBSTITUTE : 0);
 $res = '';
 foreach ($kw as $keyword):
  if (!is_scalar($keyword)):
   continue;
  endif;
  // Scraped text arrives HTML-encoded; decode it once so it can be re-encoded
  // correctly for whichever output format is requested.
  $keyword = html_entity_decode(strip_tags((string) $keyword), ENT_QUOTES, 'UTF-8');
  $keyword = trim(str_replace($rem, ' ', $keyword));
  if ('' === $keyword):
   continue;
  endif;
  if ($asHtml):
   // URL-encode the query value, then HTML-escape both the attribute and the
   // link text so scraped content cannot break out of the markup.
   $slug = urlencode(str_replace(' ', '_', $keyword));
   $res.= '<a href="?keyword='.htmlspecialchars($slug, $escFlags, 'UTF-8').'">'
    .htmlspecialchars($keyword, $escFlags, 'UTF-8')."</a><br />\n";
  else:
   $res.= trim(str_replace('_', ' ', $keyword))."\r\n";
  endif;
 endforeach;
 return $res;
}

?>
Keyword List Building Made Easy!