Welcome, guest | Sign In | My Account | Store | Cart

The getHits function in this file takes as input a search term string, and returns as output a dict containing the number of web hits returned for that search term. The Google search engine and a SQLite database are used. See code for usage information.

Python, 127 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Google Hits 1.01
# Date: 2008/06/08
# License: As-is; public domain
# Prerequisites: Python 2.5.2

# Description:
# The getHits function in this file takes as input a search term string, and returns as output a dict containing the number of web hits returned for that search term. The Google search engine and a SQLite database are used.
# The optional input arguments are:
#  - cond_enc (bool) (default is False): enclose the search term in quotes if it contains more than one word
#  - db_name (default is 'Hits.sqlite'): the name of the database to use for caching hits
#  - max_age (default is 30): the maximum acceptable age in days of cached hits

# Usage:
# from getHits import getHits
# hits=getHits('sample search term')['hits']

# Keywords:
# hits, count, popularity
# web search hits, web hits, Google hits

# Import needed modules
import datetime, os.path, re, sqlite3, urllib2

def getHits(term,cond_enc=False,db_name='Hits.sqlite',max_age=30):
    """Get web search hits for a given term, using a SQLite cache.

    Arguments:
      term     -- the search term (str or unicode).
      cond_enc -- if true, wrap the term in double quotes when it contains
                  a space, so that it is searched as a phrase.
      db_name  -- path of the SQLite database used to cache hit counts.
      max_age  -- maximum acceptable age, in days, of a cached entry.

    Returns a dict with the keys:
      'term'  -- the term actually looked up (quoted if cond_enc applied)
      'inDb'  -- True if the count came from a fresh cache entry
      'hits'  -- the number of hits (int)
    and, only when a web search was performed:
      'web search term' -- the URL-quoted term
      'datetimeutc'     -- UTC time of the search, '%Y-%m-%d %H:%M:%S'

    A cache entry older than max_age is deleted and replaced by a new web
    search result. Errors from urllib2 or sqlite3 propagate to the caller;
    the database connection is closed in every case.
    """

    # Validate term. The assert is kept (rather than raising TypeError) so
    # that the exception type seen by existing callers does not change.
    assert (isinstance(term,str) or isinstance(term,unicode)), 'Term must be a string'

    def ensureTable(con):
        """Create the Hits table on this connection if it does not exist yet"""
        # IF NOT EXISTS makes this safe to run on every call, and also covers
        # a database file that exists but has no table (or ':memory:').
        con.execute("""CREATE TABLE IF NOT EXISTS `Hits` (`Term` CHAR PRIMARY KEY  NOT NULL , `Hits` INTEGER NOT NULL , `DateTimeUTC` DATETIME NOT NULL )""")

    def getHitsDb(hits, con, max_age):
        """Fill in hits['inDb'] (and hits['hits'] on a fresh cache entry)"""
        row = con.execute('SELECT Hits,DateTimeUTC FROM Hits WHERE Term=?', (hits['term'],)).fetchone()

        # No cached entry at all
        if row is None:
            hits['inDb'] = False
            return hits

        # Age of the cached entry in (fractional) days; 86400 seconds per day
        stored = datetime.datetime.strptime(row['DateTimeUTC'], '%Y-%m-%d %H:%M:%S')
        age = datetime.datetime.utcnow() - stored
        age_days = age.days + age.seconds / 86400.0

        if age_days > max_age:
            # Stale: drop it so the fresh result can be inserted afterwards
            con.execute('DELETE FROM Hits WHERE Term=?', (hits['term'],))
            hits['inDb'] = False
        else:
            hits['inDb'] = True
            hits['hits'] = row['Hits']
        return hits

    def getHitsWeb(hits):
        """Run the web search and fill in hits['hits'] and related keys"""
        base_url = 'http://google.com/search?'

        # urllib2.quote works on byte strings; encode unicode terms as UTF-8
        # first so that non-ASCII characters are percent-encoded.
        query = hits['term']
        if not isinstance(query, str):
            query = query.encode('utf-8')
        hits['web search term'] = urllib2.quote(query)

        # Execute web search (an empty User-Agent header is sent on purpose,
        # as in the original implementation)
        request = urllib2.Request('%sq=%s' % (base_url, hits['web search term']))
        request.add_header('User-Agent', '')
        page = urllib2.urlopen(request).read()

        # Store date and time of web search
        hits['datetimeutc'] = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

        # Parse the result banner; when it is absent the count is taken as 0
        match = re.search(r'Results <b>1</b> - <b>10</b> of about <b>(?P<hits>.+?)</b> for <b>', page)
        if match is not None:
            hits['hits'] = int(match.group('hits').replace(',', ''))
        else:
            hits['hits'] = 0
        return hits

    def setHitsDb(con, hits):
        """Store web search hits for a given term in the database"""
        con.execute('INSERT INTO Hits (Term,Hits,DateTimeUTC) VALUES (?,?,?)', (hits['term'], hits['hits'], hits['datetimeutc']))

    # Create dict to store relevant info
    hits = {'term': term}

    # Conditionally enclose term in quotes
    if cond_enc and ' ' in hits['term']:
        hits['term'] = '"%s"' % hits['term']

    # Open the database in autocommit mode (isolation_level=None) and make
    # sure the connection is closed even if the lookup or web request fails.
    con = sqlite3.connect(db_name, isolation_level=None)
    try:
        con.row_factory = sqlite3.Row
        ensureTable(con)

        hits = getHitsDb(hits, con, max_age) # Get hits from database if available
        if not hits['inDb']:
            hits = getHitsWeb(hits) # Get hits from web
            setHitsDb(con, hits) # Store hits in database
    finally:
        con.close()

    # Return hits
    return hits

def cleanDb(db_name='Hits.sqlite',max_age=30):
    """Delete cached entries older than max_age days from the database.

    Arguments:
      db_name -- path of the SQLite database holding the Hits table.
      max_age -- maximum age in days; rows whose DateTimeUTC is earlier than
                 (current UTC time - max_age days) are deleted.

    Errors from sqlite3 (for example a missing Hits table) propagate to the
    caller; the connection is closed in every case.
    """

    # Determine the cutoff from max_age (previously hard-coded to 30 days,
    # which silently ignored the argument).
    min_datetime = datetime.datetime.utcnow() - datetime.timedelta(days=max_age)
    # Same text format as the stored DateTimeUTC values, so that the string
    # comparison in SQL orders chronologically.
    min_datetime = min_datetime.strftime('%Y-%m-%d %H:%M:%S')

    # Delete old entries; isolation_level=None means autocommit, so no
    # explicit commit is needed before closing.
    con = sqlite3.connect(db_name, isolation_level=None)
    try:
        con.execute('DELETE FROM Hits WHERE DateTimeUTC<?', (min_datetime,))
    finally:
        con.close()