from __future__ import unicode_literals
from datetime import date, datetime
from .constants import Category, Language, Quality
class Parser:
    """
    The Parser is a static class, responsible for parsing HTML elements and text.
    It shouldn't be used directly.

    :attr: TORRENTS_LIST_XPATH is an XPath expression used to capture the torrent list rows from the HTML parent table element,
        skipping the first 4 rows. The 3 trailing non-torrent rows are trimmed with Python slicing in `get_torrents_rows`,
        which is simpler than writing a much longer XPath expression.
    :attr: DATE_TAG_XPATH is an XPath expression used to capture the parent `tr`'s `td` element holding the date.
    :attr: DATE_STRPTIME_FORMAT is a `datetime`-compliant format string used to parse the date cell's text.
    :attr: FIRST_ROW_XPATH is an XPath expression used to capture the anchor/font tags of a torrent's first table row,
        from which id, title, tracked_by, category_url and torrent_url are derived (a torrent consists of 2 table rows).
    """

    TORRENTS_LIST_XPATH = '//*[@id="fslispc"]/table/tr/td[1]/table[6]/tr/td/table/tr[position() > 4]'
    DATE_TAG_XPATH = './td[@class="added_today"]'
    DATE_STRPTIME_FORMAT = '%A, %b %d, %Y'
    FIRST_ROW_XPATH = './td/a | ./td/font'

    @staticmethod
    def get_torrents_rows(dom):
        """
        Get the torrent list rows from the given `dom` by running `TORRENTS_LIST_XPATH`
        and dropping the last 3 matched rows, which are sorting preferences rows, not torrents.

        :param lxml.HtmlElement dom: the dom to operate on
        :return: torrent rows
        :rtype: list lxml.HtmlElement
        """
        return dom.xpath(Parser.TORRENTS_LIST_XPATH)[:-3]  # trim non-torrents

    @staticmethod
    def get_date_td(rows):
        """
        Get the table data element containing the torrents' date by executing
        :attr:`DATE_TAG_XPATH <DATE_TAG_XPATH>` on `rows`.

        :param lxml.HtmlElement rows: object to search in; it must expose an `xpath` method.
            NOTE(review): the name suggests a list of rows, but `.xpath` is called on it directly - confirm against the caller.
        :return: first table data element containing the torrents' date, or None when nothing matches
        :rtype: lxml.HtmlElement
        """
        tds = rows.xpath(Parser.DATE_TAG_XPATH)
        return tds[0] if tds else None

    @staticmethod
    def get_params(url, ignore_empty=False):
        """
        Parse a given `url` and retrieve its query parameters. Can also ignore empty value parameters.
        Handles parameters-only urls such as `q=banana&peel=false`.

        Values consisting only of digits are converted to `int`. A parameter without `=` gets an empty
        string value, and only the first `=` of a pair separates the name from the value.

        :param str url: url to parse
        :param bool ignore_empty: ignore empty value parameters or not
        :return: dictionary of params and their values
        :rtype: dict
        """
        # Everything after the first '?' is the query string. Without a '?' the
        # whole input is a parameters-only string and must be parsed from its
        # very first character.
        _, question_mark, query = url.partition('?')
        if not question_mark:
            query = url

        params_dict = {}
        for pair in query.split('&'):
            if not pair:
                continue
            # partition (not split) so values containing '=' and bare flags
            # without '=' do not raise.
            param, _, value = pair.partition('=')
            if not value and ignore_empty:
                continue
            params_dict[param] = int(value) if value.isdigit() else value
        return params_dict

    @staticmethod
    def parse_date(table_data):
        """
        Parse a given table data element with `Parser.DATE_STRPTIME_FORMAT` and create a `date` object
        from the td's text content.

        :param lxml.HtmlElement table_data: table data tag to parse
        :return: date object from td's text; today's date when the text has no 'Added on ' prefix
        :rtype: datetime.date
        """
        text = table_data.text.split('Added on ')
        # No 'Added on ' marker: then it's 'Added today'. Hacky
        if len(text) < 2:
            return date.today()
        # Looks like ['', 'Thursday, Mar 05, 2015']; surrounding whitespace
        # would make strptime fail, so strip it first.
        return datetime.strptime(text[1].strip(), Parser.DATE_STRPTIME_FORMAT).date()

    @staticmethod
    def parse_first_row(row, url_instance):
        """
        Parse a given table row element by executing `Parser.FIRST_ROW_XPATH` and scraping the torrent's
        id, title, tracked by status, category url and torrent url. Used specifically with a torrent's first table row.

        :param lxml.HtmlElement row: row to parse
        :param urls.Url url_instance: Url used to combine the base url with links scraped from the row
        :return: scraped id, title, tracked by status, category url and torrent url
        :rtype: list
        """
        # `unicode` only exists on Python 2; on Python 3 `str` is the text type.
        try:
            to_text = unicode
        except NameError:
            to_text = str

        tags = row.xpath(Parser.FIRST_ROW_XPATH)
        category_url = url_instance.combine(tags[0].get('href'))
        title = to_text(tags[1].text)

        # work with the incomplete URL to get str_id
        torrent_url = tags[1].get('href')
        str_id = torrent_url.split('details/')[1]
        if str_id.endswith('/'):
            str_id = str_id[:-1]
        # complete the torrent URL with BASE_URL
        torrent_url = url_instance.combine(torrent_url)

        # a third matched tag means that the torrent has the external property
        if len(tags) == 3:
            # monkey patch the missing external query param
            category_url += '&external=1'
            tracked_by = '(external)'
        else:
            tracked_by = 'Demonoid'
        return [str_id, title, tracked_by, category_url, torrent_url]

    @staticmethod
    def parse_second_row(row, url):
        """
        Parse a given table row element by using the helper methods `Parser.parse_torrent_properties` and
        `Parser.parse_torrent_link`, scraping the torrent's category, subcategory, quality, language, user, user url,
        torrent link, size, comments, times completed, seeders and leechers.
        Used specifically with a torrent's second table row.

        :param lxml.HtmlElement row: row to parse
        :param urls.Url url: Url used to combine the base url with links scraped from the row
        :return: scraped category, subcategory, quality, language, user, user url, torrent link, size, comments,
            times completed, seeders and leechers
        :rtype: list
        """
        tags = row.findall('./td')

        # parse_torrent_properties returns a dict; read the values by key
        # (tuple-unpacking the dict would yield its key names instead).
        properties = Parser.parse_torrent_properties(tags[0])
        category = properties['category']
        subcategory = properties['subcategory']
        quality = properties['quality']
        language = properties['language']

        user_info = tags[1].find('./a')
        user = user_info.text_content()
        user_url = url.combine(user_info.get('href'))

        # Two urls - one is spam, second is torrent url.
        # Don't combine it with BASE_URL, since it's an absolute url.
        torrent_link = Parser.parse_torrent_link(tags[2])

        size = tags[3].text  # as 10.5 GB
        comments = tags[4].text
        times_completed = tags[5].text
        seeders = tags[6].text
        leechers = tags[7].text
        return [category, subcategory, quality, language, user, user_url, torrent_link,
                size, comments, times_completed, seeders, leechers]

    @staticmethod
    def parse_torrent_properties(table_datas):
        """
        Parse a given sequence of elements and, using the helper methods `Parser.is_subcategory`,
        `Parser.is_quality` and `Parser.is_language`, collect the torrent properties.

        The first element's text is the category. Each following element's `href` parameters decide
        whether its text is the subcategory, quality or language; the first match for each property wins.

        :param list lxml.HtmlElement table_datas: elements to parse
        :return: identified category, subcategory, quality and language
        :rtype: dict
        """
        output = {'category': table_datas[0].text, 'subcategory': None, 'quality': None, 'language': None}
        for td in table_datas[1:]:
            params = Parser.get_params(td.get('href'))
            if Parser.is_subcategory(params) and not output['subcategory']:
                output['subcategory'] = td.text
            elif Parser.is_quality(params) and not output['quality']:
                output['quality'] = td.text
            elif Parser.is_language(params) and not output['language']:
                output['language'] = td.text
        return output

    @staticmethod
    def parse_torrent_link(table_data):
        """
        Find all direct anchor children of the given table data element and get the torrent url.
        The torrent url is usually preceded by a fake spam ad url, so the second anchor is used
        whenever there are at least two.

        :param lxml.HtmlElement table_data: table data tag to parse
        :return: torrent url from anchor (link) element
        :rtype: str
        """
        anchors = table_data.findall('./a')
        link_tag = anchors[0] if len(anchors) < 2 else anchors[1]
        return link_tag.get('href')

    @staticmethod
    def is_subcategory(params):
        """
        Given a dict of url parameters, cast the parameters' subcategory value to int and compare it to the
        default search query value - Category.ALL, which is also the default `ALL` search query value for all subcategories.

        :param dict params: parameters to get subcategory value from
        :return: if given parameters' subcategory is different from Category.ALL or not
        :rtype: bool
        """
        return Category.ALL.value != int(params['subcategory'])

    @staticmethod
    def is_quality(params):
        """
        Given a dict of url parameters, cast the parameters' quality value to int
        and compare it to the default search query value - Quality.ALL.

        :param dict params: parameters to get quality value from
        :return: if given parameters' quality is different from Quality.ALL or not
        :rtype: bool
        """
        # NOTE(review): unlike is_subcategory this compares without `.value` -
        # confirm Quality.ALL is a plain int and not an Enum member.
        return Quality.ALL != int(params['quality'])

    @staticmethod
    def is_language(params):
        """
        Given a dict of url parameters, cast the parameters' language value to int
        and compare it to the default search query value - Language.ALL.

        :param dict params: parameters to get language value from
        :return: if given parameters' language is different from Language.ALL or not
        :rtype: bool
        """
        # NOTE(review): unlike is_subcategory this compares without `.value` -
        # confirm Language.ALL is a plain int and not an Enum member.
        return Language.ALL != int(params['language'])