Solved decode error, can't escape HTMLParseError
author Dylan Lloyd <dylan@psu.edu>
Wed, 26 Jan 2011 06:29:02 +0000 (01:29 -0500)
committer Dylan Lloyd <dylan@psu.edu>
Wed, 26 Jan 2011 06:29:02 +0000 (01:29 -0500)
youtube.py

index 324b2c6..06ef7db 100755 (executable)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
 import urllib
 from doit import pandora_fetch
 
@@ -21,9 +21,21 @@ class search_youtube(HTMLParser):
         for search in search_terms:
             self.__in_result = False
             search = urllib.quote_plus(search)
-            query = 'http://youtube.com/results?search_query='
-            page = urllib.urlopen(query + search).read()
-            self.feed(page)
+            url = 'http://youtube.com/results?search_query='
+            connection = urllib.urlopen(url + search)
+            encoding = connection.headers.getparam('charset')
+            page = connection.read()
+            page = page.decode(encoding)
+            try:
+                self.feed(page)
+            except UnicodeDecodeError:
+                print 'problem decoding', url + search
+            except HTMLParseError:
+                # There is no way to override HTMLParseError and
+                # continue parsing, see:
+                # http://bugs.python.org/issue755660
+                # But the data is there!
+                print 'problem parsing', url + search
 
     def handle_starttag(self, tag, attrs):
         if tag == 'div':
@@ -35,12 +47,8 @@ class search_youtube(HTMLParser):
                     track_id = value
             if self.__in_result and len(track_id[19:]) == 11:
                 self.track_ids.append(track_id[19:])
-                print track_id[19:]
                 self.__in_result = False
 
-    def handle_endtag(self, tag):
-        pass
-
-
 
 results = search_youtube(searches)
+print results.track_ids