search_classes.py

   1 #!/usr/bin/env python
   2
   3 from HTMLParser import HTMLParser, HTMLParseError
   4 import urllib
   5 from fetch_pandora import pandora_fetch
   6
   7 class pandora_fetch(HTMLParser):
   8     """ This class should be initiated with a Pandora account username. It exposes a list of tracks `self.tracks` and a dictionary of title->artist pairs `tracks`.
   9     """
  10
  11     def __init__(self, user):
  12         HTMLParser.__init__(self)
  13         self.user = user
  14         self.stations = []
  15         self.tracks = {}
  16         self.__in_row = False
  17         self.__in_track = False
  18         self.__current_track = None
  19         self.__mode = 'stations'
  20         page = urllib.urlopen('http://www.pandora.com/favorites/profile_tablerows_station.vm?webname=' + self.user).read()
  21         self.feed(page)
  22         self.__mode = 'tracks'
  23         if len(self.stations) == 0:
  24             print 'Are you sure your pandora profile is public? Can\'t seem to find any stations listed with your account.'
  25             return 1
  26         for station in self.stations:
  27             page = urllib.urlopen('http://www.pandora.com/favorites/station_tablerows_thumb_up.vm?token=' + station + '&sort_col=thumbsUpDate').read()
  28             self.feed(page)
  29
  30     def handle_starttag(self, tag, attrs):
  31         if self.__mode == 'stations':
  32             if tag == 'div':
  33                 for attr, value in attrs:
  34                     if attr == 'class' and value == 'station_table_row':
  35                         self.__in_row = True
  36                         continue
  37             if self.__in_row and tag == 'a':
  38                 for attr, value in attrs:
  39                     if self.__in_row and attr == 'href':
  40                         self.stations.append(value[10:])
  41                         continue
  42         if self.__mode == 'tracks':
  43             if tag == 'span':
  44                 for attr, value in attrs:
  45                     if attr == 'class' and value == 'track_title':
  46                         self.__in_track = True
  47                         continue
  48                     if attr == 'tracktitle':
  49                         self.__current_track = value
  50
  51     def handle_data(self, text):
  52         if self.__in_track:
  53             self.tracks[self.__current_track] = text
  54
  55     def handle_endtag(self, tag):
  56         if tag == 'div':
  57             self.__in_row = False
  58         if tag == 'a':
  59             self.__in_track = False
  60             self.__current_track = None
  61
  62
  63 class search_youtube(HTMLParser):
  64     """ This class should be initiated with a list of search terms. It exposes a list of YouTube video ids `self.track_ids`.  """
  65
  66     def __init__(self, search_terms):
  67         self.track_ids = []
  68         for search in search_terms:
  69             HTMLParser.__init__(self)
  70             page = ''
  71             self.__in_search_results = False
  72             search = urllib.quote_plus(search)
  73             url = 'http://youtube.com/results?search_query='
  74             connection = urllib.urlopen(url + search)
  75             encoding = connection.headers.getparam('charset')
  76             page = connection.read()
  77             page = page.decode(encoding)
  78             try:
  79                 self.feed(page)
  80             except UnicodeDecodeError:
  81                 print 'problem decoding', url + search
  82             except UnicodeEncodeError:
  83                 print 'problem encoding', url + search
  84             except HTMLParseError:
  85                 # There is no way to override HTMLParseError and
  86                 # continue parsing, see:
  87                 # http://bugs.python.org/issue755660
  88                 # But the data is there!
  89                 print 'problem parsing', url + search
  90             except found_video:
  91                 pass
  92
  93     def handle_starttag(self, tag, attrs):
  94         if tag == 'div':
  95             for attr, value in attrs:
  96                 if attr == 'id' and value == 'search-results':
  97                     self.__in_search_results = True
  98         if self.__in_search_results:
  99             for attr, value in attrs:
 100                 if attr == 'href' and value[:-11] == '/watch?v=' and len(value[9:]) == 11:
 101                     self.track_ids.append(value[9:])
 102                     self.__in_search_results = False
 103                     #self.reset()
 104                     # Calling self.reset() causes the following error:
 105
 106                     # File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed self.goahead(0)
 107                     # File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead k = self.parse_starttag(i)
 108                     # File "/usr/lib/python2.6/HTMLParser.py", line 229, in parse_starttag endpos = self.check_for_whole_start_tag(i)
 109                     # File "/usr/lib/python2.6/HTMLParser.py", line 305, in check_for_whole_start_tag
 110                     # raise AssertionError("we should not get here!")
 111
 112                     # I can't figure out why that's happening. I've
 113                     # discovered that calling HTMLParser.__init__(self)
 114                     # inside the search term loop in self.__init__ also
 115                     # resets the instance. The instance must be reset to
 116                     # accept a new page with self.feed(). Until a better
 117                     # solution is found:
 118                     raise found_video
 119
 120 class found_video(BaseException):
 121     """ Exception class to throw after finding a video to stop HTMLParser. """
 122     pass