Welcome, guest | Sign In | My Account | Store | Cart

Advogato (http://www.advogato.org) exports members' diaries in a simple XML format. This script fetches the entries and stores them in a dictionary keyed by date. I assume it can also be used with other virgule sites, such as http://www.badvogato.org.

Python, 72 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python

import sgmllib, string, urllib

class DiaryParser(sgmllib.SGMLParser):
    
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.entries = []
        self.dates = [] 
        self.inHtml = 0
        self.inDate = 0
        self.data = ""
        
    def handle_data(self, data):
        self.data = self.data + data
    
    def unknown_starttag(self, tag, attrs):
        pass
                
    def unknown_endtag(self, tag):
        pass

    def start_html(self, attributes):
        self.inHtml = 1
        self.data = ""
        self.setliteral()
    
    def end_html(self):
        self.entries.append(self.data)
        self.inHtml = 0
    
    def start_date(self, attributes):
        self.data = ""
        self.setliteral()
    
    def end_html(self):
        self.entries.append(self.data)
        self.inHtml = 0
    
    def start_date(self, attributes):
        self.data = ""
        self.inDate = 1
    
    def end_date(self):
        self.dates.append(self.data)
        self.inDate = 0
        

def getEntries(person):
    """ Fetch an Advogato member's diary and return a dictionary in the form
        { date : entry, ... }

        person is the member's account name; it is URL-quoted before being
        inserted into the diary URL.  Errors raised while opening or reading
        the URL propagate to the caller.
    """

    parser = DiaryParser()
    f = urllib.urlopen("http://www.advogato.org/person/%s/diary.xml" % urllib.quote(person))

    try:
        # Feed the parser in 8 KB chunks instead of reading the whole reply.
        s = f.read(8192)
        while s:
            parser.feed(s)
            s = f.read(8192)
    finally:
        # Always release the connection, even if reading or parsing fails.
        f.close()

    parser.close()
    result = {}
    # map(None, a, b) pairs items like zip() but pads the shorter list with
    # None, so a date without an entry (or vice versa) is still recorded.
    for d, e in map(None, parser.dates, parser.entries):
        result[d] = e
    return result


if __name__=='__main__':
    import sys
    # The member name is required as the first argument; print a usage
    # message instead of failing with an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <advogato-member-name>" % sys.argv[0])
    # A single parenthesised argument prints the same under Python 2's
    # print statement.
    print(getEntries(sys.argv[1]))