Fix SiCKRAGETV/sickrage-issues/issues/3347

HD-Torrents has some invalid HTML on the search results page, so the default HTML parser wasn't returning the correct data. Substituted it with the html5lib parser to fix the problem. Updated BeautifulSoup to v4. Invalid portions of the HTML are now cut out before the page is fed to the parser. Added error handling and a case-insensitive match. Fixed detection of seeders/leechers and improved size detection.

Fix SiCKRAGETV/sickrage-issues/issues/3347
3dec7692 · Nick Sologoub · 47c43c01 · 3dec7692
Commit 3dec7692 authored Oct 17, 2015 by Nick Sologoub
--- a/sickbeard/providers/hdtorrents.py
+++ b/sickbeard/providers/hdtorrents.py
@@ -30,7 +30,7 @@ from sickbeard import helpers
 from sickbeard import show_name_helpers
 from sickrage.helper.exceptions import AuthException
 import requests
-from BeautifulSoup import BeautifulSoup as soup
+from bs4 import BeautifulSoup
 from unidecode import unidecode
 from sickbeard.helpers import sanitizeSceneName
 from datetime import datetime
@@ -119,7 +119,18 @@ class HDTorrentsProvider(generic.TorrentProvider):
                    logger.log("No data returned from provider", logger.DEBUG)
                    continue
-                html = soup(data)
+                # Search result page contains some invalid html that prevents html parser from returning all data.
+                # We cut everything before the table that contains the data we are interested in thus eliminating
+                # the invalid html portions
+                try:
+                    index = data.lower().index('<table class="mainblockcontenttt"')
+                except ValueError:
+                    logger.log(u"Could not find table of torrents mainblockcontenttt", logger.ERROR)
+                    continue
+                data = data[index:]
+                html = BeautifulSoup(data, 'html5lib')
                if not html:
                    logger.log("No html data parsed from provider", logger.DEBUG)
                    continue
@@ -145,25 +156,28 @@ class HDTorrentsProvider(generic.TorrentProvider):
                        if not cells:
                            continue
-                        title = download_url = seeders = leechers = None
+                        title = download_url = seeders = leechers = size = None
-                        size = 0
                        for cell in cells:
                            try:
                                if None is title and cell.get('title') and cell.get('title') in 'Download':
                                    title = re.search('f=(.*).torrent', cell.a['href']).group(1).replace('+', '.')
                                    download_url = self.urls['home'] % cell.a['href']
+                                    continue
                                if None is seeders and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red':
                                    seeders = int(cell.text)
                                    if not seeders:
                                        seeders = 1
+                                        continue
                                elif None is leechers and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red':
                                    leechers = int(cell.text)
                                    if not leechers:
-                                        seeders = 0
+                                        leechers = 0
+                                        continue
                                # Need size for failed downloads handling
-                                if re.match(r'[0-9]+,?\.?[0-9]* [KkMmGg]+[Bb]+', cells[7].text):
+                                if size is None:
-                                    size = self._convertSize(cells[7].text)
+                                    if re.match(r'[0-9]+,?\.?[0-9]* [KkMmGg]+[Bb]+', cell.text):
+                                        size = self._convertSize(cell.text)
                                        if not size:
                                            size = -1