Private GIT

Skip to content
Snippets Groups Projects
Commit 3dec7692 authored by Nick Sologoub's avatar Nick Sologoub
Browse files

Fix SiCKRAGETV/sickrage-issues/issues/3347

HD-Torrents has some invalid HTML on its search results page, so the
default HTML parser was not returning the correct data. Substituted
it with the html5lib parser to fix the problem.

Update BeautifulSoup to v4 (bs4)

Cut out the invalid portions of the HTML before feeding it to the parser.

Added error handling and case-insensitive match

Fixed detection of seeders/leechers and improved size detection
parent 47c43c01
Branches
Tags
No related merge requests found
...@@ -30,7 +30,7 @@ from sickbeard import helpers ...@@ -30,7 +30,7 @@ from sickbeard import helpers
from sickbeard import show_name_helpers from sickbeard import show_name_helpers
from sickrage.helper.exceptions import AuthException from sickrage.helper.exceptions import AuthException
import requests import requests
from BeautifulSoup import BeautifulSoup as soup from bs4 import BeautifulSoup
from unidecode import unidecode from unidecode import unidecode
from sickbeard.helpers import sanitizeSceneName from sickbeard.helpers import sanitizeSceneName
from datetime import datetime from datetime import datetime
...@@ -119,7 +119,18 @@ class HDTorrentsProvider(generic.TorrentProvider): ...@@ -119,7 +119,18 @@ class HDTorrentsProvider(generic.TorrentProvider):
logger.log("No data returned from provider", logger.DEBUG) logger.log("No data returned from provider", logger.DEBUG)
continue continue
html = soup(data) # Search result page contains some invalid html that prevents html parser from returning all data.
# We cut everything before the table that contains the data we are interested in thus eliminating
# the invalid html portions
try:
index = data.lower().index('<table class="mainblockcontenttt"')
except ValueError:
logger.log(u"Could not find table of torrents mainblockcontenttt", logger.ERROR)
continue
data = data[index:]
html = BeautifulSoup(data, 'html5lib')
if not html: if not html:
logger.log("No html data parsed from provider", logger.DEBUG) logger.log("No html data parsed from provider", logger.DEBUG)
continue continue
...@@ -145,25 +156,28 @@ class HDTorrentsProvider(generic.TorrentProvider): ...@@ -145,25 +156,28 @@ class HDTorrentsProvider(generic.TorrentProvider):
if not cells: if not cells:
continue continue
title = download_url = seeders = leechers = None title = download_url = seeders = leechers = size = None
size = 0
for cell in cells: for cell in cells:
try: try:
if None is title and cell.get('title') and cell.get('title') in 'Download': if None is title and cell.get('title') and cell.get('title') in 'Download':
title = re.search('f=(.*).torrent', cell.a['href']).group(1).replace('+', '.') title = re.search('f=(.*).torrent', cell.a['href']).group(1).replace('+', '.')
download_url = self.urls['home'] % cell.a['href'] download_url = self.urls['home'] % cell.a['href']
continue
if None is seeders and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red': if None is seeders and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red':
seeders = int(cell.text) seeders = int(cell.text)
if not seeders: if not seeders:
seeders = 1 seeders = 1
continue
elif None is leechers and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red': elif None is leechers and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red':
leechers = int(cell.text) leechers = int(cell.text)
if not leechers: if not leechers:
seeders = 0 leechers = 0
continue
# Need size for failed downloads handling # Need size for failed downloads handling
if re.match(r'[0-9]+,?\.?[0-9]* [KkMmGg]+[Bb]+', cells[7].text): if size is None:
size = self._convertSize(cells[7].text) if re.match(r'[0-9]+,?\.?[0-9]* [KkMmGg]+[Bb]+', cell.text):
size = self._convertSize(cell.text)
if not size: if not size:
size = -1 size = -1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment