Commit 6739e03f authored by miigotu

Rewrite TPB to use bs4 instead of the ridiculously process-intensive regex

parent 6ff4bfca
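As a rough, standalone sketch of the approach this commit adopts — parsing the results table with BeautifulSoup instead of one large multi-group regex. This uses bs4 directly rather than SickRage's BS4Parser wrapper, and the HTML fragment below is invented to stand in for a live TPB search page; all names and values are illustrative only:

import re
from bs4 import BeautifulSoup

# Invented fragment mimicking the structure of TPB's "searchResult" table.
SAMPLE_HTML = """
<table id="searchResult">
  <tr><th><a href="#">Type</a></th><th><a href="#">Name</a></th>
      <th><a href="#">SE</a></th><th><a href="#">LE</a></th></tr>
  <tr>
    <td>Video</td>
    <td><a class="detName">Some.Show.S01E01.720p</a>
        <a href="magnet:?xt=urn:btih:abc123" title="Download this torrent using magnet">magnet</a>
        <font class="detDesc">Uploaded 03-11 2015, Size 1.71 GiB, ULed by someone</font></td>
    <td>42</td><td>7</td>
  </tr>
</table>
"""

soup = BeautifulSoup(SAMPLE_HTML, 'html.parser')
table = soup.find('table', id='searchResult')
rows = table.find_all('tr') if table else []

# Map header labels to column positions once, instead of hard-coding indexes.
labels = [th.get_text(strip=True) for th in rows[0].find_all('th')]

for row in rows[1:]:
    cells = row.find_all('td')
    title = row.find(class_='detName').get_text(strip=True)
    magnet = row.find(title='Download this torrent using magnet')['href']
    seeders = int(cells[labels.index('SE')].get_text(strip=True))
    leechers = int(cells[labels.index('LE')].get_text(strip=True))
    # "Uploaded 03-11 2015, Size 1.71 GiB, ..." -> "1.71 GiB"
    desc = row.find(class_='detDesc').get_text(strip=True)
    size = re.sub(r'Size ([\d.]+).+?([KMG]iB)', r'\1 \2', desc.split(', ')[1])
    print('%s %s S:%s L:%s %s' % (title, magnet, seeders, leechers, size))

Resolving column positions from the header labels (labels.index('SE')) keeps the row loop working if the site reorders columns, which is the main robustness win over the old single regex.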
@@ -22,8 +22,8 @@ import posixpath  # Must use posixpath
 from urllib import urlencode
 from sickbeard import logger
 from sickbeard import tvcache
-from sickbeard.common import USER_AGENT
-from sickrage.helper.common import convert_size
+from sickbeard.bs4_parser import BS4Parser
+from sickrage.helper.common import try_int, convert_size
 from sickrage.providers.torrent.TorrentProvider import TorrentProvider
@@ -41,22 +41,21 @@ class ThePirateBayProvider(TorrentProvider):  # pylint: disable=too-many-instance-attributes
         self.cache = ThePirateBayCache(self)

+        self.url = 'https://thepiratebay.se/'
         self.urls = {
-            'base_url': 'https://thepiratebay.se/',
-            'search': 'https://thepiratebay.se/s/',
-            'rss': 'https://thepiratebay.se/tv/latest'
+            'search': self.url + 's/',
+            'rss': self.url + 'tv/latest'
         }

-        self.url = self.urls['base_url']
         self.custom_url = None

-        self.headers.update({'User-Agent': USER_AGENT})
-
+    def search(self, search_strings, age=0, ep_obj=None):  # pylint: disable=too-many-locals, too-many-branches
+        results = []
         """
         205 = SD, 208 = HD, 200 = All Videos
         https://pirateproxy.pl/s/?q=Game of Thrones&type=search&orderby=7&page=0&category=200
         """
-        self.search_params = {
+        search_params = {
             'q': '',
             'type': 'search',
             'orderby': 7,
@@ -64,62 +63,78 @@ class ThePirateBayProvider(TorrentProvider):  # pylint: disable=too-many-instance-attributes
             'category': 200
         }

-        self.re_title_url = r'/torrent/(?P<id>\d+)/(?P<title>.*?)".+?(?P<url>magnet.*?)".+?Size (?P<size>[\d\.]*&nbsp;[TGKMiB]{2,3}).+?(?P<seeders>\d+)</td>.+?(?P<leechers>\d+)</td>'
-
-    def search(self, search_strings, age=0, ep_obj=None):  # pylint: disable=too-many-locals
-        results = []
         for mode in search_strings:
             items = []
             logger.log(u"Search Mode: %s" % mode, logger.DEBUG)
             for search_string in search_strings[mode]:

-                self.search_params.update({'q': search_string.strip()})
                 if mode != 'RSS':
-                    logger.log(u"Search string: " + search_string, logger.DEBUG)
+                    logger.log(u"Search string: %s " % search_string, logger.DEBUG)

+                search_params['q'] = search_string.strip()

-                searchURL = self.urls[('search', 'rss')[mode == 'RSS']] + '?' + urlencode(self.search_params)
+                search_url = self.urls[('search', 'rss')[mode == 'RSS']] + '?' + urlencode(search_params)
                 if self.custom_url:
-                    searchURL = posixpath.join(self.custom_url, searchURL.split(self.url)[1].lstrip('/'))  # Must use posixpath
+                    search_url = posixpath.join(self.custom_url, search_url.split(self.url)[1].lstrip('/'))  # Must use posixpath

-                logger.log(u"Search URL: %s" % searchURL, logger.DEBUG)
-                data = self.get_url(searchURL)
+                logger.log(u"Search URL: %s" % search_url, logger.DEBUG)
+                data = self.get_url(search_url)
                 if not data:
                     logger.log(u'URL did not return data, maybe try a custom url, or a different one', logger.DEBUG)
                     continue

-                matches = re.compile(self.re_title_url, re.DOTALL).finditer(data)
-                for torrent in matches:
-                    title = torrent.group('title')
-                    download_url = torrent.group('url')
-                    # id = int(torrent.group('id'))
-                    seeders = int(torrent.group('seeders'))
-                    leechers = int(torrent.group('leechers'))
-                    torrent_size = torrent.group('size')
-                    size = convert_size(torrent_size) or -1
+                with BS4Parser(data, 'html5lib') as html:
+                    torrent_table = html.find('table', id='searchResult')
+                    torrent_rows = torrent_table.find_all('tr') if torrent_table else []

+                    # Continue only if one Release is found
+                    if len(torrent_rows) < 2:
+                        logger.log(u"Data returned from provider does not contain any torrents", logger.DEBUG)
+                        continue

+                    def process_column_header(th):
+                        result = ''
+                        if th.a:
+                            result = th.a.get_text(strip=True)
+                        if not result:
+                            result = th.get_text(strip=True)
+                        return result

+                    labels = [process_column_header(label) for label in torrent_rows[0].find_all('th')]

+                    for result in torrent_rows[1:]:
+                        try:
+                            cells = result.find_all('td')

+                            title = result.find(class_='detName').get_text(strip=True)
+                            download_url = result.find(title="Download this torrent using magnet")['href']
+                            if not all([title, download_url]):
+                                continue

+                            # Filter unseeded torrent
+                            seeders = try_int(cells[labels.index('SE')].get_text(strip=True))
+                            leechers = try_int(cells[labels.index('LE')].get_text(strip=True))
+                            if seeders < self.minseed or leechers < self.minleech:
+                                if mode != 'RSS':
+                                    logger.log(u"Discarding torrent because it doesn't meet the minimum seeders or leechers: {0} (S:{1} L:{2})".format(title, seeders, leechers), logger.DEBUG)
+                                continue

+                            # Accept Torrent only from Good People for every Episode Search
-                    if self.confirmed and re.search(r'(VIP|Trusted|Helper|Moderator)', torrent.group(0)) is None:
+                            if self.confirmed and not result.find(alt=re.compile(r'(VIP|Trusted|Helper|Moderator)')):
+                                if mode != 'RSS':
+                                    logger.log(u"Found result %s but that doesn't seem like a trusted result so I'm ignoring it" % title, logger.DEBUG)
+                                continue

+                            # Convert size after all possible skip scenarios
+                            torrent_size = cells[labels.index('Name')].find(class_='detDesc').get_text(strip=True).split(', ')[1]
+                            torrent_size = re.sub(r'Size ([\d.]+).+([KMG]iB)', r'\1 \2', torrent_size)
+                            size = convert_size(torrent_size) or -1

+                            item = title, download_url, size, seeders, leechers
+                            if mode != 'RSS':
+                                logger.log(u"Found result: %s " % title, logger.DEBUG)

+                            items.append(item)
+                        except StandardError:
+                            continue

             # For each search mode sort all the items by seeders if available
             items.sort(key=lambda tup: tup[3], reverse=True)
@@ -141,7 +156,7 @@ class ThePirateBayCache(tvcache.TVCache):
         self.minTime = 30

     def _getRSSData(self):
-        search_params = {'RSS': ['']}
-        return {'entries': self.provider.search(search_params)}
+        search_strings = {'RSS': ['']}
+        return {'entries': self.provider.search(search_strings)}

 provider = ThePirateBayProvider()
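A side note on the "# Must use posixpath" comment that survives this diff: URL paths always use forward slashes, while os.path.join follows the host OS separator (backslashes on Windows), so posixpath.join is the safe choice for grafting the search path onto a user-supplied proxy. A minimal illustration of the custom-URL rewrite; the proxy URL and query string below are invented for the example:

import posixpath

url = 'https://thepiratebay.se/'
custom_url = 'https://my-tpb-proxy.example/'  # hypothetical value of self.custom_url
search_url = 'https://thepiratebay.se/s/?q=Game+of+Thrones&type=search'

# Keep everything after the provider's base URL and graft it onto the proxy.
tail = search_url.split(url)[1].lstrip('/')
print(posixpath.join(custom_url, tail))
# -> https://my-tpb-proxy.example/s/?q=Game+of+Thrones&type=search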