#!/usr/bin/python
#
# goopy: python module for google searches.  Version 0.5
# Written by Michael G. (mynameisfiber@gmail.com) (copyleft)
#
# This version can handle:
#     Websites (local or foreign language)
#     Video Results
#     Files (ie: pdf, ps, ppt, etc.)
#     Recommended Search Terms
#
# =====================================================================
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# =====================================================================

try:  # Python 2 module layout
    from urllib import quote_plus
    import urllib2
except ImportError:  # Python 3: same functionality, new module names
    from urllib.parse import quote_plus
    import urllib.request as urllib2
import re
from textwrap import wrap


class websearch(object):
    """Run a Google web search and expose the scraped hits as a sequence.

    Constructing an instance immediately fetches the first result page.
    Further pages are fetched on demand by `get_results`, indexing,
    slicing and iteration.  Every hit is a dict with the keys ``'url'``,
    ``'title'`` and ``'desc'``.

    Attributes:
        results: list of all hits parsed so far, in page order.
        query: the most recently searched query string.
        offset: the ``start`` offset of the most recently fetched page.
        recommend: the "Did you mean" suggestion ("" until one is seen).
        numResults: the "of about N" figure exactly as it appears in the
            page (a string such as "1,230"), or 0 when the page has none.
        iterate: cursor used by the iterator protocol.
    """

    # The patterns are compiled once for the whole class; instances still
    # reach them as ``self.find_*`` / ``self.clean_regex``.

    # NOTE(review): the beginning of this pattern was missing from the copy
    # of the file under review (it started mid-expression and the title
    # group had lost its name).  It has been rebuilt from the sibling
    # patterns below -- confirm against the upstream goopy source.
    find_results = re.compile(
        r'<a href="(?P<url>.+?)"'
        r' class=l>(?P<title>.+?)</a>(<nobr>(.+?)</nobr>)?'
        r'<table border=0 cellpadding=0 cellspacing=0>'
        r'<tr><td class="j( hc)?"><div class=std>(?P<desc>.+?)<br>')
    find_videos = re.compile(
        r'<td valign=top><a href="(?P<url>.+?)" class=l>(?P<title>.+?)</a>'
        r'<br><font size=-1>(?P<desc>.+?)<br>')
    find_files = re.compile(
        r'\[(?P<type1>.{2,5})\]</b></font></span> <a href="(?P<url>.+?)" class=l>'
        r'(?P<title>.+?)</a><table border=0 cellpadding=0 cellspacing=0>'
        r'<tr><td class="j"><div class=std>'
        r'<span class=f>File Format:</span> (?P<type2>.+?)( - |<br>)(.+?)<br>'
        r'(?P<desc>.+?)<br>')
    find_recommend = re.compile(
        r'<font color="#cc0000" class=p>Did you mean( to search for)?: </font><a'
        r' href="(.+?)" class=p><b><i>(?P<recommend>.+?)</i></b></a>')
    find_num_results = re.compile(
        r'<font size=-1>Results <b>[0-9,]+</b> - <b>[0-9,]+</b>'
        r' of about <b>(?P<results>[0-9,]+)</b>')
    # Matches any HTML tag or character entity.
    clean_regex = re.compile(r'(<(/)?(.+?)>|&(.+?);)')

    def __init__(self, query):
        """Set up the HTTP opener and state, then fetch the first page.

        Args:
            query: the search terms, as plain (unescaped) text.
        """
        self.opener = urllib2.build_opener()
        self.results = []
        self.query = query
        self.offset = 0
        self.recommend = ""
        self.numResults = 0
        self.iterate = 0
        self.search(self.query)

    def clean(self, string):
        """Return `string` with HTML tags and character entities removed."""
        return self.clean_regex.sub("", string)

    def format_width(self, string):
        """Wrap `string` at 60 columns, indenting continuation lines by a tab."""
        return "\n\t".join(wrap(string, 60))

    def search(self, query, offset=0):
        """Fetch one result page for `query` and append its hits.

        Args:
            query: the search terms, as plain (unescaped) text.
            offset: value for Google's ``start`` parameter; omitted from
                the request when not positive.

        Returns:
            The accumulated `results` list (the same list object that is
            stored on the instance).
        """
        start = ""
        if offset > 0:
            start = "&start=%d" % offset
        request = urllib2.Request(
            'http://www.google.com/search?q=' + quote_plus(query) + start)
        self.opener.addheaders = [('User-Agent', 'google-cli')]

        response = self.opener.open(request)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        if not isinstance(data, str):
            # Python 3 hands back bytes while the patterns are text.
            data = data.decode("utf-8", "replace")

        total = self.find_num_results.search(data)
        self.numResults = total.group("results") if total else 0

        suggestion = self.find_recommend.search(data)
        if suggestion:
            self.recommend = suggestion.group("recommend")

        self.query = query
        self.offset = offset
        self.results.extend(self.extract_results(data))
        return self.results

    def search_next(self):
        """Fetch the page that follows the most recently fetched one."""
        return self.search(self.query, self.offset + 10)

    def extract_results(self, data):
        """Parse the hits out of one page of result HTML.

        Args:
            data: the HTML text of a result page.

        Returns:
            A list of ``{'url', 'title', 'desc'}`` dicts: file hits first
            (title prefixed with the bracketed file type), then video
            hits, then ordinary web hits.  Titles and descriptions are
            passed through `clean`.
        """
        files = [
            {'url': hit.group('url'),
             'title': "[%s] %s" % (self.clean(hit.group('type1')),
                                   self.clean(hit.group('title'))),
             'desc': self.clean(hit.group('desc'))}
            for hit in self.find_files.finditer(data)]
        videos = [
            {'url': hit.group('url'),
             'title': self.clean(hit.group('title')),
             'desc': self.clean(hit.group('desc'))}
            for hit in self.find_videos.finditer(data)]
        # The web pattern is loose enough to match the markup of a file
        # hit as well, so skip URLs that were already reported as files.
        file_urls = set(entry['url'] for entry in files)
        web = [
            {'url': hit.group('url'),
             'title': self.clean(hit.group('title')),
             'desc': self.clean(hit.group('desc'))}
            for hit in self.find_results.finditer(data)
            if hit.group('url') not in file_urls]
        return files + videos + web

    def show_results(self, index, number):
        """Print `number` results starting at `index`, one numbered entry each."""
        for position, hit in enumerate(self.get_results(index, number)):
            # A single parenthesised argument prints identically on
            # Python 2 and Python 3.
            print("%d) %s\n\t%s\n\t%s" % (index + position, hit["title"],
                                          hit["url"],
                                          self.format_width(hit["desc"])))

    def get_results(self, index, number):
        """Return the results in ``[index, index + number)``, fetching as needed.

        More pages are requested while too few results are loaded and the
        last page reported a non-zero result count.  Fetching also stops
        as soon as a page contributes no new results, so an exhausted
        search cannot loop forever.  The returned list may therefore be
        shorter than `number`.
        """
        wanted = index + number
        while len(self.results) < wanted and self.numResults != 0:
            loaded = len(self.results)
            self.search_next()
            if len(self.results) == loaded:
                break
        return self.results[index:wanted]

    def next(self):
        """Return the next result for iteration, fetching it if necessary.

        Raises:
            StopIteration: when no further result is available.
        """
        self.iterate += 1
        try:
            return self.get_results(self.iterate - 1, 1)[0]
        except IndexError:
            raise StopIteration()

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def __iter__(self):
        """Restart iteration from the first result and return self."""
        self.iterate = 0
        return self

    def __repr__(self):
        """Return the fixed type tag used to represent instances."""
        return "<type 'goopy.websearch'>"

    def __len__(self):
        """Return the number of results loaded so far."""
        return len(self.results)

    def __getitem__(self, key):
        """Return the result or slice of results selected by `key`.

        Results up to the index (or the slice's stop) are fetched first.
        A slice without a stop only covers what is already loaded.
        """
        high = key.stop if isinstance(key, slice) else key
        if high is not None:
            self.get_results(high, 1)
        return self.results[key]

    def __str__(self):
        """Return all loaded results as text, separated by blank lines."""
        return "\n\n".join(
            "%s (%s)\n\t%s" % (hit["title"], hit["url"],
                               self.format_width(hit["desc"]))
            for hit in self.results)