Solved decode error; still can't escape HTMLParseError

author Dylan Lloyd <dylan@psu.edu>

Wed, 26 Jan 2011 06:29:02 +0000 (01:29 -0500)

committer Dylan Lloyd <dylan@psu.edu>

Wed, 26 Jan 2011 06:29:02 +0000 (01:29 -0500)
author Dylan Lloyd <dylan@psu.edu>
Wed, 26 Jan 2011 06:29:02 +0000 (01:29 -0500)
committer Dylan Lloyd <dylan@psu.edu>
Wed, 26 Jan 2011 06:29:02 +0000 (01:29 -0500)
diff --git a/youtube.py b/youtube.py

index 324b2c6..06ef7db 100755 (executable)
--- a/youtube.py
+++ b/youtube.py
@@ -1,6 +1,6 @@
  #!/usr/bin/env python
  
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
  import urllib
  from doit import pandora_fetch
  
@@ -21,9 +21,21 @@ class search_youtube(HTMLParser):
          for search in search_terms:
              self.__in_result = False
              search = urllib.quote_plus(search)
-            query = 'http://youtube.com/results?search_query='
-            page = urllib.urlopen(query + search).read()
-            self.feed(page)
+            url = 'http://youtube.com/results?search_query='
+            connection = urllib.urlopen(url + search)
+            encoding = connection.headers.getparam('charset')
+            page = connection.read()
+            page = page.decode(encoding)
+            try:
+                self.feed(page)
+            except UnicodeDecodeError:
+                print 'problem decoding', url + search
+            except HTMLParseError:
+                # There is no way to override HTMLParseError and
+                # continue parsing, see:
+                # http://bugs.python.org/issue755660
+                # But the data is there!
+                print 'problem parsing', url + search
  
      def handle_starttag(self, tag, attrs):
          if tag == 'div':
@@ -35,12 +47,8 @@ class search_youtube(HTMLParser):
                      track_id = value
              if self.__in_result and len(track_id[19:]) == 11:
                  self.track_ids.append(track_id[19:])
-                print track_id[19:]
                  self.__in_result = False
  
-    def handle_endtag(self, tag):
-        pass
-
-
  
  results = search_youtube(searches)
+print results.track_ids
author	Dylan Lloyd <dylan@psu.edu>
	Wed, 26 Jan 2011 06:29:02 +0000 (01:29 -0500)
committer	Dylan Lloyd <dylan@psu.edu>
	Wed, 26 Jan 2011 06:29:02 +0000 (01:29 -0500)