Works, but fails unpleasantly when no result is found.
[patent-api.git] / patent.py
1 #!/usr/bin/env python
2
3 import json
4 import urllib2
5 from BeautifulSoup import BeautifulSoup
6
7 import cgi
8 import cgitb
9 cgitb.enable()
10
11 import os
12 from urlparse import urlparse
13 import urllib
14
15
# Module-level result store: patent identifier -> {"href", "description"}.
# Parse.__init__ fills it in place, and the __main__ entry point serializes
# it as the JSON response body.
patents = dict()

# Module-level placeholder for the search term. Parse.__init__ never assigns
# this global (it works with a local variable), so it stays empty.
query = str()
18
19
class Parse(object):
    """Run a USPTO full-text search and record the hits in ``patents``.

    The search term is read from the query string of the CGI
    ``REQUEST_URI`` environment variable. Every usable row of the results
    table adds one entry to the module-level ``patents`` dict::

        patents[GUID] = {"href": <absolute URL>, "description": <cell text>}

    Attributes:
        soup: BeautifulSoup tree of the fetched results page.
        GUID, href, description: values of the last row that was recorded,
            or ``None`` when no row was recorded.

    Raises:
        KeyError: if ``REQUEST_URI`` is not present in the environment.
        urllib2.URLError: if the USPTO page cannot be fetched.
    """

    def __init__(self):
        # Give the attributes a defined value even when nothing matches, so
        # an empty search no longer leaves them unset.
        self.GUID = None
        self.href = None
        self.description = None

        # The first two characters of the query string are dropped --
        # presumably a "q=" style prefix; TODO confirm against the caller.
        raw_query = urlparse(os.environ['REQUEST_URI']).query[2:]
        term = urllib.quote(raw_query)
        usptoURL = (
            'http://patft.uspto.gov/netacgi/nph-Parser'
            '?Sect1=PTO2&Sect2=HITOFF&p=1'
            '&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html'
            '&r=0&f=S&l=5&TERM1=' + term +
            '&FIELD1=&co1=AND&TERM2=&FIELD2=&d=PTXT'
        )

        # urllib2 responses are not context managers in Python 2, so close
        # the connection explicitly.
        response = urllib2.urlopen(usptoURL)
        try:
            html = response.read()
        finally:
            response.close()
        self.soup = BeautifulSoup(html)

        # The hits are read from the second <table> of the page. When the
        # page has fewer than two tables (e.g. nothing matched), leave
        # ``patents`` untouched instead of raising IndexError.
        tables = self.soup.findAll('table')
        if len(tables) < 2:
            return

        # The first row is skipped, as in the original (treated as header).
        for tr in tables[1].findAll('tr')[1:]:
            cells = tr.findAll('td')
            # A usable row has a link in its second and fourth cells; skip
            # anything else rather than crash or reuse the previous row's
            # values.
            if len(cells) < 4:
                continue
            guid_link = cells[1].a
            title_link = cells[3].a
            if guid_link is None or title_link is None:
                continue

            # Encode explicitly: str() on unicode text raises
            # UnicodeEncodeError for non-ASCII characters under Python 2.
            self.GUID = guid_link.text.encode('utf-8')
            self.href = "http://patft.uspto.gov" + str(title_link['href'])
            self.description = (
                cells[3].text.replace('\n', '').replace('\t', '')
                .encode('utf-8')
            )
            patents[self.GUID] = {
                "href": self.href,
                "description": self.description,
            }
42
43
44 if __name__ == "__main__":
45 parsed = Parse()
46 print "Content-Type: application/json", "\n\n", json.dumps(patents)