Merge branch 'htmlparser' into dev
[i_like_pandora.git] / search_classes.py
1 #!/usr/bin/env python
2
3 from HTMLParser import HTMLParser, HTMLParseError
4 import urllib
5 from fetch_pandora import pandora_fetch
6
class pandora_fetch(HTMLParser):
    """Scrape a Pandora profile for its stations and thumbed-up tracks.

    Instantiate with a Pandora account username. All fetching and parsing
    happens in the constructor; afterwards the instance exposes:

    ``stations``
        List of station tokens, taken from the ``href`` of every link found
        inside a ``station_table_row`` div on the profile page.
    ``tracks``
        Dictionary keyed by the ``tracktitle`` attribute of each
        ``track_title`` span. The value is the last chunk of text seen
        between that span and the next closing ``</a>`` (described by the
        original author as title -> artist pairs).
    """

    _STATIONS_URL = ('http://www.pandora.com/favorites/'
                     'profile_tablerows_station.vm?webname=')
    _TRACKS_URL = ('http://www.pandora.com/favorites/'
                   'station_tablerows_thumb_up.vm?token=')
    _TRACKS_SORT = '&sort_col=thumbsUpDate'

    def __init__(self, user):
        """Fetch and parse the station list, then each station's tracks.

        user -- Pandora username, appended verbatim to the profile URL.

        If no stations are found a hint is printed and the instance is left
        with empty ``stations`` and ``tracks``.
        """
        HTMLParser.__init__(self)
        self.user = user
        self.stations = []
        self.tracks = {}
        self.__in_row = False
        self.__in_track = False
        self.__current_track = None

        self.__mode = 'stations'
        self._parse_page(self._STATIONS_URL + self.user)

        self.__mode = 'tracks'
        if not self.stations:
            # Single-argument print(...) behaves the same as the Python 2
            # print statement.
            print("Are you sure your pandora profile is public? Can't seem "
                  "to find any stations listed with your account.")
            # __init__ must return None; returning a value raises TypeError
            # when the class is instantiated.
            return
        for station in self.stations:
            self._parse_page(self._TRACKS_URL + station + self._TRACKS_SORT)

    def _read_url(self, url):
        """Return the raw body of `url`, always closing the connection."""
        connection = urllib.urlopen(url)
        try:
            return connection.read()
        finally:
            connection.close()

    def _parse_page(self, url):
        """Download `url` and run it through the parser as a new document."""
        # Drop any unparsed tail of the previous page so it cannot be glued
        # onto the start of this one.
        self.reset()
        self.feed(self._read_url(url))

    def handle_starttag(self, tag, attrs):
        """Collect station tokens or track titles, depending on the mode."""
        if self.__mode == 'stations':
            if tag == 'div':
                if ('class', 'station_table_row') in attrs:
                    self.__in_row = True
            elif self.__in_row and tag == 'a':
                for attr, value in attrs:
                    # A valueless href arrives as None; skip it.
                    if attr == 'href' and value:
                        # Drops the first 10 characters of the link --
                        # presumably a '/stations/' prefix; confirm against
                        # the live markup.
                        self.stations.append(value[10:])
        elif self.__mode == 'tracks' and tag == 'span':
            for attr, value in attrs:
                if attr == 'class' and value == 'track_title':
                    self.__in_track = True
                elif attr == 'tracktitle':
                    self.__current_track = value

    def handle_data(self, text):
        """Record `text` against the current track title, if there is one."""
        # Without a tracktitle attribute there is no key to file the text
        # under; skip it rather than creating a None entry.
        if self.__in_track and self.__current_track is not None:
            self.tracks[self.__current_track] = text

    def handle_endtag(self, tag):
        """Leave the station row on ``</div>`` and the track on ``</a>``."""
        if tag == 'div':
            self.__in_row = False
        if tag == 'a':
            self.__in_track = False
            self.__current_track = None
61
62
class search_youtube(HTMLParser):
    """Collect the first YouTube search hit for each search term.

    Instantiate with an iterable of search strings. Each term is looked up
    on YouTube's results page during construction. ``self.track_ids`` then
    holds the 11-character video id of the first ``/watch?v=`` link found
    inside the ``search-results`` div, one entry per search that produced
    such a link (searches without a hit add nothing).
    """

    _SEARCH_URL = 'http://youtube.com/results?search_query='
    _WATCH_PREFIX = '/watch?v='
    _VIDEO_ID_LENGTH = 11
    _FALLBACK_ENCODING = 'utf-8'

    def __init__(self, search_terms):
        """Run one search per entry of `search_terms` and gather video ids.

        Decoding and parsing problems are reported with a printed message
        and the loop moves on to the next term.
        """
        # Initialise up front so the instance is usable even when
        # `search_terms` is empty.
        HTMLParser.__init__(self)
        self.track_ids = []
        self.__in_search_results = False
        for search in search_terms:
            # The parser must be reset before it can accept a new page,
            # in particular after found_video aborted the previous feed().
            self.reset()
            self.__in_search_results = False
            url = self._SEARCH_URL + urllib.quote_plus(search)
            raw, encoding = self._fetch_page(url)
            try:
                self.feed(self._decode_page(raw, encoding))
            except found_video:
                # Raised by handle_starttag once an id has been stored.
                pass
            except UnicodeDecodeError:
                print('problem decoding ' + url)
            except UnicodeEncodeError:
                print('problem encoding ' + url)
            except HTMLParseError:
                # There is no way to override HTMLParseError and
                # continue parsing, see:
                # http://bugs.python.org/issue755660
                # But the data is there!
                print('problem parsing ' + url)

    def _fetch_page(self, url):
        """Return ``(raw_body, charset)`` for `url`, closing the connection.

        ``charset`` is None when the response headers do not declare one.
        """
        connection = urllib.urlopen(url)
        try:
            return connection.read(), connection.headers.getparam('charset')
        finally:
            connection.close()

    @staticmethod
    def _decode_page(raw, encoding):
        """Decode `raw` with `encoding`, falling back to UTF-8.

        The fallback applies when `encoding` is missing/empty or names a
        codec Python does not know. UnicodeDecodeError is left to the
        caller.
        """
        fallback = search_youtube._FALLBACK_ENCODING
        try:
            return raw.decode(encoding or fallback)
        except LookupError:
            return raw.decode(fallback)

    def _is_watch_link(self, value):
        """Tell whether `value` is ``/watch?v=`` plus exactly 11 characters."""
        if not value:
            # Valueless attributes are reported as None.
            return False
        expected = len(self._WATCH_PREFIX) + self._VIDEO_ID_LENGTH
        return len(value) == expected and value.startswith(self._WATCH_PREFIX)

    def handle_starttag(self, tag, attrs):
        """Store the first watch-link id inside the results div, then stop.

        Raises found_video right after storing an id so that feed() is
        aborted for the current page.
        """
        if tag == 'div' and ('id', 'search-results') in attrs:
            self.__in_search_results = True
        if not self.__in_search_results:
            return
        for attr, value in attrs:
            if attr == 'href' and self._is_watch_link(value):
                self.track_ids.append(value[len(self._WATCH_PREFIX):])
                self.__in_search_results = False
                # Calling self.reset() here instead fails inside
                # HTMLParser.check_for_whole_start_tag (Python 2.6) with
                # AssertionError("we should not get here!"). The original
                # author could not determine why. Until a better solution
                # is found, unwind out of feed() with an exception and let
                # __init__ reset the parser before the next page.
                raise found_video
119
class found_video(BaseException):
    """Signal raised once a video has been found, to stop HTMLParser.

    Note: this derives from BaseException rather than Exception --
    presumably so that broad ``except Exception`` handlers do not swallow
    it; confirm before changing the base class.
    """