|
Description:
The getHits function in this file takes a search term string as input and returns a dict containing the number of web hits that the Google search engine reports for that term. Results are cached in a SQLite database and reused until they are older than a configurable maximum age in days (30 by default). See the code for usage information.
Source: Text Source
import datetime
import os.path
import re
import sqlite3

try:
    import urllib2  # Python 2 only
except ImportError:
    # Python 3 has no urllib2 (it was split into urllib.request and
    # urllib.parse); keep this module importable there.
    urllib2 = None
def getHits(term, cond_enc=False, db_name='Hits.sqlite', max_age=30):
    """Get web search hits for a given term.

    A cached count is read from the ``Hits`` table of the SQLite database
    ``db_name`` when an entry for the term exists and is at most ``max_age``
    days old. Otherwise the count is scraped from a Google results page and
    stored in the database for later calls.

    Args:
        term: Search term; must be a string.
        cond_enc: If true and ``term`` contains a space, the term is wrapped
            in double quotes before it is looked up or searched for.
        db_name: Path of the SQLite database used as a cache. The ``Hits``
            table is created when it does not exist yet.
        max_age: Maximum age, in days, of a cached entry. Older entries are
            deleted and refreshed from the web.

    Returns:
        A dict with the keys ``'term'`` (the possibly quoted term),
        ``'inDb'`` (True when the count came from the database) and
        ``'hits'`` (the hit count). When a web search was performed the dict
        also holds ``'web search term'`` (the URL-quoted term) and
        ``'datetimeutc'`` (UTC timestamp of the search).

    Raises:
        AssertionError: If ``term`` is not a string.
    """
    try:
        string_types = (str, unicode)  # Python 2  # noqa: F821
    except NameError:
        string_types = (str,)  # Python 3: str is already the text type
    assert isinstance(term, string_types), 'Term must be a string'

    timestamp_format = '%Y-%m-%d %H:%M:%S'

    def getHitsDb(hits, con, max_age):
        """Fill ``hits`` from the database if a fresh enough entry exists."""
        row = con.execute('SELECT Hits,DateTimeUTC FROM Hits WHERE Term=?',
                          (hits['term'],)).fetchone()
        if row is None:
            hits['inDb'] = False
            return hits
        stored = datetime.datetime.strptime(row['DateTimeUTC'],
                                            timestamp_format)
        age = datetime.datetime.utcnow() - stored
        # Age in (fractional) days; a day has 86400 seconds.
        age_days = age.days + age.seconds / 86400.0
        if age_days > max_age:
            # Stale entry: drop it so the fresh result can be inserted.
            con.execute('DELETE FROM Hits WHERE Term=?', (hits['term'],))
            hits['inDb'] = False
        else:
            hits['inDb'] = True
            hits['hits'] = row['Hits']
        return hits

    def getHitsWeb(hits):
        """Fill ``hits`` with the count scraped from a web search."""
        try:  # Python 2
            from urllib2 import Request, quote, urlopen
        except ImportError:  # Python 3
            from urllib.parse import quote
            from urllib.request import Request, urlopen
        query = hits['term']
        if not isinstance(query, bytes):
            # Python 2's quote() raises KeyError on non-ASCII unicode, so
            # always hand it UTF-8 bytes (Python 3's quote accepts bytes too).
            query = query.encode('utf-8')
        hits['web search term'] = quote(query)
        request = Request('http://google.com/search?q=%s'
                          % hits['web search term'])
        # An empty User-Agent replaces the library's default one.
        request.add_header('User-Agent', '')
        response = urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()
        # The body arrives as bytes; decode so the text pattern below can be
        # applied on both Python 2 and Python 3.
        page = page.decode('utf-8', 'replace')
        hits['datetimeutc'] = datetime.datetime.utcnow().strftime(
            timestamp_format)
        match = re.search('Results <b>1</b> - <b>10</b> of about <b>(?P<hits>.+?)</b> for <b>', page)
        if match is not None:
            hits['hits'] = int(match.group('hits').replace(',', ''))
        else:
            # No result-count banner found in the page: report zero hits.
            hits['hits'] = 0
        return hits

    def setHitsDb(con, hits):
        """Store web search hits for a given term in the database."""
        con.execute('INSERT INTO Hits (Term,Hits,DateTimeUTC) VALUES (?,?,?)',
                    (hits['term'], hits['hits'], hits['datetimeutc']))

    # isolation_level=None puts the connection in autocommit mode, so every
    # statement is persisted immediately without an explicit commit().
    con = sqlite3.connect(db_name, isolation_level=None)
    try:
        con.row_factory = sqlite3.Row
        # Create the table on the same connection and only when missing, so
        # an existing-but-empty database file also works.
        con.execute("""CREATE TABLE IF NOT EXISTS `Hits` (`Term` CHAR PRIMARY KEY NOT NULL , `Hits` INTEGER NOT NULL , `DateTimeUTC` DATETIME NOT NULL )""")
        hits = {'term': term}
        if cond_enc and ' ' in hits['term']:
            hits['term'] = '"%s"' % hits['term']
        hits = getHitsDb(hits, con, max_age)
        if not hits['inDb']:
            hits = getHitsWeb(hits)
            setHitsDb(con, hits)
        return hits
    finally:
        con.close()
def cleanDb(db_name='Hits.sqlite', max_age=30):
    """Delete database entries older than the given maximum age.

    Args:
        db_name: Path of the SQLite database holding the ``Hits`` table.
        max_age: Maximum age in days; rows whose ``DateTimeUTC`` lies further
            in the past than this are deleted.
    """
    # Honour the caller's max_age rather than a fixed 30 days.
    min_datetime = datetime.datetime.utcnow() - datetime.timedelta(days=max_age)
    # Timestamps are stored as 'YYYY-MM-DD HH:MM:SS' text, which sorts
    # chronologically, so a plain string comparison in SQL is sufficient.
    min_datetime = min_datetime.strftime('%Y-%m-%d %H:%M:%S')
    # isolation_level=None: autocommit, the DELETE is persisted immediately.
    con = sqlite3.connect(db_name, isolation_level=None)
    try:
        con.execute('DELETE FROM Hits WHERE DateTimeUTC<?', (min_datetime,))
    finally:
        con.close()
Discussion:
|