search_classes.py

   1 #!/usr/bin/env python
   2
   3 from HTMLParser import HTMLParser, HTMLParseError
   4 import urllib
   5
   6 class pandora_fetch(HTMLParser):
   7     """ This class should be initiated with a Pandora account username. It exposes a list of tracks `self.tracks` and a dictionary of title->artist pairs `tracks`.
   8     """
   9
  10     def __init__(self, user):
  11         HTMLParser.__init__(self)
  12         self.user = user
  13         self.stations = []
  14         self.tracks = {}
  15         self.__in_row = False
  16         self.__in_track = False
  17         self.__current_track = None
  18         self.__mode = 'stations'
  19         page = urllib.urlopen('http://www.pandora.com/favorites/profile_tablerows_station.vm?webname=' + self.user).read()
  20         self.feed(page)
  21         self.__mode = 'tracks'
  22         if len(self.stations) == 0:
  23             print 'Are you sure your pandora profile is public? Can\'t seem to find any stations listed with your account.'
  24             return 1
  25         for station in self.stations:
  26             page = urllib.urlopen('http://www.pandora.com/favorites/station_tablerows_thumb_up.vm?token=' + station + '&sort_col=thumbsUpDate').read()
  27             self.feed(page)
  28
  29     def handle_starttag(self, tag, attrs):
  30         if self.__mode == 'stations':
  31             if tag == 'div':
  32                 for attr, value in attrs:
  33                     if attr == 'class' and value == 'station_table_row':
  34                         self.__in_row = True
  35                         continue
  36             if self.__in_row and tag == 'a':
  37                 for attr, value in attrs:
  38                     if self.__in_row and attr == 'href':
  39                         self.stations.append(value[10:])
  40                         continue
  41         if self.__mode == 'tracks':
  42             if tag == 'span':
  43                 for attr, value in attrs:
  44                     if attr == 'class' and value == 'track_title':
  45                         self.__in_track = True
  46                         continue
  47                     if attr == 'tracktitle':
  48                         self.__current_track = value
  49
  50     def handle_data(self, text):
  51         if self.__in_track:
  52             self.tracks[self.__current_track] = text
  53
  54     def handle_endtag(self, tag):
  55         if tag == 'div':
  56             self.__in_row = False
  57         if tag == 'a':
  58             self.__in_track = False
  59             self.__current_track = None
  60
  61
  62 class search_youtube(HTMLParser):
  63     """ This class should be initiated with a list of search terms. It exposes a list of YouTube video ids `self.track_ids`.  """
  64
  65     def __init__(self, search_terms):
  66         self.track_ids = []
  67         for search in search_terms:
  68             HTMLParser.__init__(self)
  69             page = ''
  70             self.__in_search_results = False
  71             search = urllib.quote_plus(search)
  72             url = 'http://youtube.com/results?search_query='
  73             connection = urllib.urlopen(url + search)
  74             encoding = connection.headers.getparam('charset')
  75             page = connection.read()
  76             page = page.decode(encoding)
  77             try:
  78                 self.feed(page)
  79             except UnicodeDecodeError:
  80                 print 'problem decoding', url + search
  81             except UnicodeEncodeError:
  82                 print 'problem encoding', url + search
  83             except HTMLParseError:
  84                 # There is no way to override HTMLParseError and
  85                 # continue parsing, see:
  86                 # http://bugs.python.org/issue755660
  87                 # But the data is there!
  88                 print 'problem parsing', url + search
  89             except found_video:
  90                 pass
  91
  92     def handle_starttag(self, tag, attrs):
  93         if tag == 'div':
  94             for attr, value in attrs:
  95                 if attr == 'id' and value == 'search-results':
  96                     self.__in_search_results = True
  97         if self.__in_search_results:
  98             for attr, value in attrs:
  99                 if attr == 'href' and value[:-11] == '/watch?v=' and len(value[9:]) == 11:
 100                     self.track_ids.append(value[9:])
 101                     self.__in_search_results = False
 102                     #self.reset()
 103                     # Calling self.reset() causes the following error:
 104
 105                     # File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed self.goahead(0)
 106                     # File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead k = self.parse_starttag(i)
 107                     # File "/usr/lib/python2.6/HTMLParser.py", line 229, in parse_starttag endpos = self.check_for_whole_start_tag(i)
 108                     # File "/usr/lib/python2.6/HTMLParser.py", line 305, in check_for_whole_start_tag
 109                     # raise AssertionError("we should not get here!")
 110
 111                     # I can't figure out why that's happening. I've
 112                     # discovered that calling HTMLParser.__init__(self)
 113                     # inside the search term loop in self.__init__ also
 114                     # resets the instance. The instance must be reset to
 115                     # accept a new page with self.feed(). Until a better
 116                     # solution is found:
 117                     raise found_video
 118
 119 class found_video(BaseException):
 120     """ Exception class to throw after finding a video to stop HTMLParser. """
 121     pass