Commit 35470937 authored by Kfir Hadas, committed by miigotu
Update Python dependencies (#3716)

* Remove MultipartPostHandler.py (unused)

* Remove sqliteshelf.py (unused)

* Update tornado to 4.5.1 (was 4.4.2)

Dependencies:
- certifi updated to 2017.4.17
- singledispatch is up-to-date @ 3.4.0.3
- backports_abc updated to 0.5
- backports.ssl-match-hostname updated to 3.5.0.1
(used by tornado/netutil.py, for python versions prior to 2.7.9)
- six is up-to-date @ 1.10.0

* Rename shutDown to shutdown; fix static file routes

(the routes from webserve.py were taking precedence)

* Mako updated to 1.0.6

Dependencies:
- MarkupSafe updated to 1.0

* Update README.md

Add information

* Update BeautifulSoup4 to 4.5.3 (was 4.4.1)

* Update html5lib to 0.999999999 (was 0.999)

Dependencies:
- [NEW] webencodings [required: Any, installed: 0.5.1]
- six [required: Any, installed: 1.10.0]

* Revert "Change IMDB parser to html.parse."

This reverts commit 12896fc3f9de86a27b17cf38c7dc41c2f7ab9267.

Maybe updating html5lib resolved that issue.

* Update README.md

- bs4
- html5lib
parent 88506c12

Showing changes with 1022 additions and 751 deletions
@@ -456,7 +456,7 @@ class SickRage(object):
         # shutdown web server
         if self.web_server:
             logger.log('Shutting down Tornado')
-            self.web_server.shutDown()
+            self.web_server.shutdown()
             try:
                 self.web_server.join(10)
 ...
MultipartPostHandler.py (removed):

#!/usr/bin/python
####
# 06/2010 Nic Wolfe <nic@wolfeden.ca>
# 02/2006 Will Holcomb <wholcomb@gmail.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
import urllib
import urllib2
import mimetools, mimetypes
import os, sys

# Controls how sequences are encoded. If true, elements may be given
# multiple values by assigning a sequence.
doseq = 1

class MultipartPostHandler(urllib2.BaseHandler):
    handler_order = urllib2.HTTPHandler.handler_order - 10  # needs to run first

    def http_request(self, request):
        data = request.get_data()
        if data is not None and type(data) != str:
            v_files = []
            v_vars = []
            try:
                for (key, value) in data.items():
                    if type(value) in (file, list, tuple):
                        v_files.append((key, value))
                    else:
                        v_vars.append((key, value))
            except TypeError:
                systype, value, traceback = sys.exc_info()
                raise TypeError, "not a valid non-string sequence or mapping object", traceback

            if len(v_files) == 0:
                data = urllib.urlencode(v_vars, doseq)
            else:
                boundary, data = MultipartPostHandler.multipart_encode(v_vars, v_files)
                contenttype = 'multipart/form-data; boundary=%s' % boundary
                if (request.has_header('Content-Type')
                        and request.get_header('Content-Type').find('multipart/form-data') != 0):
                    print "Replacing %s with %s" % (request.get_header('content-type'), 'multipart/form-data')
                request.add_unredirected_header('Content-Type', contenttype)

            request.add_data(data)
        return request

    @staticmethod
    def multipart_encode(vars, files, boundary=None, buffer=None):
        if boundary is None:
            boundary = mimetools.choose_boundary()
        if buffer is None:
            buffer = ''
        for (key, value) in vars:
            buffer += '--%s\r\n' % boundary
            buffer += 'Content-Disposition: form-data; name="%s"' % key
            buffer += '\r\n\r\n' + value + '\r\n'
        for (key, fd) in files:
            # allow them to pass in a file or a tuple with name & data
            if type(fd) == file:
                name_in = fd.name
                fd.seek(0)
                data_in = fd.read()
            elif type(fd) in (tuple, list):
                name_in, data_in = fd

            filename = os.path.basename(name_in)
            contenttype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
            buffer += '--%s\r\n' % boundary
            buffer += 'Content-Disposition: form-data; name="%s"; filename="%s"\r\n' % (key, filename)
            buffer += 'Content-Type: %s\r\n' % contenttype
            # buffer += 'Content-Length: %s\r\n' % file_size
            buffer += '\r\n' + data_in + '\r\n'
        buffer += '--%s--\r\n\r\n' % boundary
        return boundary, buffer

    https_request = http_request
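For reference, a typical Python 2 use of this now-removed handler looked like the sketch below (hypothetical URL and form fields):

```python
# Hypothetical Python 2 usage of the removed MultipartPostHandler.
# Any dict value that is a file object is sent as a multipart upload.
import urllib2
from MultipartPostHandler import MultipartPostHandler

opener = urllib2.build_opener(MultipartPostHandler)
params = {"username": "bob", "file": open("report.txt", "rb")}
opener.open("http://example.com/upload", params)
```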
Libraries directory
======================
Vendored python packages and custom libraries go in this folder.
Keep this list updated with installed versions and their dependencies,<br/>
and ordered by the top-level library name.
Adding a new package
---------
The best practice is to install the package within a Python **virtual environment** (using `virtualenv`),<br/>
then use `pipdeptree -p PACKAGE` to list the package and its dependencies with their installed versions.<br/>
Add the output to the list below at the appropriate location (based on the top-level package name).
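For example, vendoring `tornado` might look like this (hypothetical session; exact output depends on the environment):

```
$ virtualenv venv && source venv/bin/activate
$ pip install pipdeptree tornado==4.5.1
$ pipdeptree -p tornado
tornado==4.5.1
  - backports-abc [required: >=0.4, installed: 0.5]
  - backports.ssl-match-hostname [required: Any, installed: 3.5.0.1]
  - certifi [required: Any, installed: 2017.4.17]
  - singledispatch [required: Any, installed: 3.4.0.3]
  - six [required: Any, installed: 1.10.0]
```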
***
Packages List
=========
```
beautifulsoup4==4.5.3
html5lib==0.999999999
- six [required: Any, installed: 1.10.0]
- webencodings [required: Any, installed: 0.5.1]
Mako==1.0.6
- MarkupSafe [required: >=0.9.2, installed: 1.0]
tornado==4.5.1
- backports-abc [required: >=0.4, installed: 0.5]
- backports.ssl-match-hostname [required: Any, installed: 3.5.0.1]
- certifi [required: Any, installed: 2017.4.17]
- singledispatch [required: Any, installed: 3.4.0.3]
- six [required: Any, installed: 1.10.0]
```
"""The match_hostname() function from Python 3.3.3, essential when using SSL.""" """The match_hostname() function from Python 3.3.3, essential when using SSL."""
import re import re
import sys
# ipaddress has been backported to 2.6+ in pypi. If it is installed on the
# system, use it to handle IPAddress ServerAltnames (this was added in
# python-3.5) otherwise only do DNS matching. This allows
# backports.ssl_match_hostname to continue to be used all the way back to
# python-2.4.
try:
import ipaddress
except ImportError:
ipaddress = None
__version__ = '3.5.0.1'
__version__ = '3.4.0.2'
class CertificateError(ValueError): class CertificateError(ValueError):
pass pass
...@@ -61,6 +73,23 @@ def _dnsname_match(dn, hostname, max_wildcards=1): ...@@ -61,6 +73,23 @@ def _dnsname_match(dn, hostname, max_wildcards=1):
return pat.match(hostname) return pat.match(hostname)
def _to_unicode(obj):
if isinstance(obj, str) and sys.version_info < (3,):
obj = unicode(obj, encoding='ascii', errors='strict')
return obj
def _ipaddress_match(ipname, host_ip):
"""Exact matching of IP addresses.
RFC 6125 explicitly doesn't define an algorithm for this
(section 1.7.2 - "Out of Scope").
"""
# OpenSSL may add a trailing newline to a subjectAltName's IP address
# Divergence from upstream: ipaddress can't handle byte str
ip = ipaddress.ip_address(_to_unicode(ipname).rstrip())
return ip == host_ip
def match_hostname(cert, hostname): def match_hostname(cert, hostname):
"""Verify that *cert* (in decoded format as returned by """Verify that *cert* (in decoded format as returned by
SSLSocket.getpeercert()) matches the *hostname*. RFC 2818 and RFC 6125 SSLSocket.getpeercert()) matches the *hostname*. RFC 2818 and RFC 6125
...@@ -70,12 +99,35 @@ def match_hostname(cert, hostname): ...@@ -70,12 +99,35 @@ def match_hostname(cert, hostname):
returns nothing. returns nothing.
""" """
if not cert: if not cert:
raise ValueError("empty or no certificate") raise ValueError("empty or no certificate, match_hostname needs a "
"SSL socket or SSL context with either "
"CERT_OPTIONAL or CERT_REQUIRED")
try:
# Divergence from upstream: ipaddress can't handle byte str
host_ip = ipaddress.ip_address(_to_unicode(hostname))
except ValueError:
# Not an IP address (common case)
host_ip = None
except UnicodeError:
# Divergence from upstream: Have to deal with ipaddress not taking
# byte strings. addresses should be all ascii, so we consider it not
# an ipaddress in this case
host_ip = None
except AttributeError:
# Divergence from upstream: Make ipaddress library optional
if ipaddress is None:
host_ip = None
else:
raise
dnsnames = [] dnsnames = []
san = cert.get('subjectAltName', ()) san = cert.get('subjectAltName', ())
for key, value in san: for key, value in san:
if key == 'DNS': if key == 'DNS':
if _dnsname_match(value, hostname): if host_ip is None and _dnsname_match(value, hostname):
return
dnsnames.append(value)
elif key == 'IP Address':
if host_ip is not None and _ipaddress_match(value, host_ip):
return return
dnsnames.append(value) dnsnames.append(value)
if not dnsnames: if not dnsnames:
......
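A minimal sketch of what the new 'IP Address' subjectAltName branch enables (hypothetical cert dict; requires the `ipaddress` backport on Python 2):

```python
# Hedged sketch: exercising the new IP-SAN path added above.
from backports.ssl_match_hostname import match_hostname, CertificateError

cert = {'subjectAltName': (('DNS', 'example.com'), ('IP Address', '10.0.0.1'))}

match_hostname(cert, 'example.com')  # DNS entry matches as before
match_hostname(cert, '10.0.0.1')     # IP entry now matches via _ipaddress_match

try:
    match_hostname(cert, '10.0.0.2')
except CertificateError as exc:
    print(exc)  # hostname doesn't match either subjectAltName entry
```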
@@ -21,6 +21,20 @@ except ImportError:
     import collections as _collections_abc

+def get_mro(cls):
+    try:
+        return cls.__mro__
+    except AttributeError:
+        return old_style_mro(cls)
+
+def old_style_mro(cls):
+    yield cls
+    for base in cls.__bases__:
+        for c in old_style_mro(base):
+            yield c
+
 def mk_gen():
     from abc import abstractmethod

@@ -63,7 +77,7 @@ def mk_gen():
         @classmethod
         def __subclasshook__(cls, C):
             if cls is Generator:
-                mro = C.__mro__
+                mro = get_mro(C)
                 for method in required_methods:
                     for base in mro:
                         if method in base.__dict__:

@@ -88,7 +102,7 @@ def mk_awaitable():
         @classmethod
         def __subclasshook__(cls, C):
             if cls is Awaitable:
-                for B in C.__mro__:
+                for B in get_mro(C):
                     if '__await__' in B.__dict__:
                         if B.__dict__['__await__']:
                             return True

@@ -144,7 +158,7 @@ def mk_coroutine():
         @classmethod
         def __subclasshook__(cls, C):
             if cls is Coroutine:
-                mro = C.__mro__
+                mro = get_mro(C)
                 for method in ('__await__', 'send', 'throw', 'close'):
                     for base in mro:
                         if method in base.__dict__:
 ...
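Why the helper exists: on Python 2, old-style classes have no `__mro__`, so the previous `C.__mro__` lookup raised AttributeError during `isinstance()` checks against these ABCs. A small Python 2 sketch (hypothetical class):

```python
# Python 2 sketch: old-style classes (not inheriting from object) lack __mro__.
from backports_abc import get_mro

class OldStyle:  # old-style on Python 2
    pass

print(hasattr(OldStyle, '__mro__'))             # False on Python 2
print([c.__name__ for c in get_mro(OldStyle)])  # falls back to walking __bases__
```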
@@ -5,26 +5,31 @@ http://www.crummy.com/software/BeautifulSoup/

 Beautiful Soup uses a pluggable XML or HTML parser to parse a
 (possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
+provides methods and Pythonic idioms that make it easy to navigate,
+search, and modify the parse tree.

-Beautiful Soup works with Python 2.6 and up. It works better if lxml
+Beautiful Soup works with Python 2.7 and up. It works better if lxml
 and/or html5lib is installed.

 For more than you ever wanted to know about Beautiful Soup, see the
 documentation:
 http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.4.1"
-__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
+__version__ = "4.5.3"
+__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']

 import os
 import re
+import traceback
 import warnings

 from .builder import builder_registry, ParserRejectedMarkup
@@ -77,7 +82,7 @@ class BeautifulSoup(Tag):

     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,

@@ -137,6 +142,10 @@ class BeautifulSoup(Tag):
         from_encoding = from_encoding or deprecated_argument(
             "fromEncoding", "from_encoding")

+        if from_encoding and isinstance(markup, unicode):
+            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+            from_encoding = None
+
         if len(kwargs) > 0:
             arg = kwargs.keys().pop()
             raise TypeError(

@@ -161,19 +170,29 @@ class BeautifulSoup(Tag):
                     markup_type = "XML"
                 else:
                     markup_type = "HTML"
+
+                caller = traceback.extract_stack()[0]
+                filename = caller[0]
+                line_number = caller[1]
                 warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+                    filename=filename,
+                    line_number=line_number,
                     parser=builder.NAME,
                     markup_type=markup_type))

         self.builder = builder
         self.is_xml = builder.is_xml
+        self.known_xml = self.is_xml
         self.builder.soup = self

         self.parse_only = parse_only

         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
-        elif len(markup) <= 256:
+        elif len(markup) <= 256 and (
+                (isinstance(markup, bytes) and not b'<' in markup)
+                or (isinstance(markup, unicode) and not u'<' in markup)
+        ):
             # Print out warnings for a couple beginner problems
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,

@@ -195,16 +214,10 @@ class BeautifulSoup(Tag):
             if isinstance(markup, unicode):
                 markup = markup.encode("utf8")
             warnings.warn(
-                '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
-        if markup[:5] == "http:" or markup[:6] == "https:":
-            # TODO: This is ugly but I couldn't get it to work in
-            # Python 3 otherwise.
-            if ((isinstance(markup, bytes) and not b' ' in markup)
-                or (isinstance(markup, unicode) and not u' ' in markup)):
-                if isinstance(markup, unicode):
-                    markup = markup.encode("utf8")
-                warnings.warn(
-                    '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+                '"%s" looks like a filename, not markup. You should'
+                'probably open this file and pass the filehandle into'
+                'Beautiful Soup.' % markup)
+        self._check_markup_is_url(markup)

         for (self.markup, self.original_encoding, self.declared_html_encoding,
              self.contains_replacement_characters) in (
@@ -223,15 +236,52 @@ class BeautifulSoup(Tag):
         self.builder.soup = None

     def __copy__(self):
-        return type(self)(self.encode(), builder=self.builder)
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy

     def __getstate__(self):
         # Frequently a tree builder can't be pickled.
         d = dict(self.__dict__)
         if 'builder' in d and not self.builder.picklable:
-            del d['builder']
+            d['builder'] = None
         return d

+    @staticmethod
+    def _check_markup_is_url(markup):
+        """
+        Check if markup looks like it's actually a url and raise a warning
+        if so. Markup can be unicode or str (py2) / bytes (py3).
+        """
+        if isinstance(markup, bytes):
+            space = b' '
+            cant_start_with = (b"http:", b"https:")
+        elif isinstance(markup, unicode):
+            space = u' '
+            cant_start_with = (u"http:", u"https:")
+        else:
+            return
+
+        if any(markup.startswith(prefix) for prefix in cant_start_with):
+            if not space in markup:
+                if isinstance(markup, bytes):
+                    decoded_markup = markup.decode('utf-8', 'replace')
+                else:
+                    decoded_markup = markup
+                warnings.warn(
+                    '"%s" looks like a URL. Beautiful Soup is not an'
+                    ' HTTP client. You should probably use an HTTP client like'
+                    ' requests to get the document behind the URL, and feed'
+                    ' that document to Beautiful Soup.' % decoded_markup
+                )
+
     def _feed(self):
         # Convert the document to Unicode.
         self.builder.reset()
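A hedged sketch of what the `__copy__` fix preserves (the byte markup and encoding are illustrative):

```python
# Sketch: copy.copy() now re-parses a UTF-8 encoding of the tree but
# keeps the copy's .original_encoding pointing at the source document's.
import copy
from bs4 import BeautifulSoup

soup = BeautifulSoup(b'<h1>\xed\xe5\xec\xf9</h1>', 'html.parser',
                     from_encoding='iso8859-8')
clone = copy.copy(soup)
print(clone.original_encoding)  # 'iso8859-8', not 'utf-8'
```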
@@ -335,7 +385,18 @@ class BeautifulSoup(Tag):
             if parent.next_sibling:
                 # This node is being inserted into an element that has
                 # already been parsed. Deal with any dangling references.
-                index = parent.contents.index(o)
+                index = len(parent.contents)-1
+                while index >= 0:
+                    if parent.contents[index] is o:
+                        break
+                    index -= 1
+                else:
+                    raise ValueError(
+                        "Error building tree: supposedly %r was inserted "
+                        "into %r after the fact, but I don't see it!" % (
+                            o, parent
+                        )
+                    )
                 if index == 0:
                     previous_element = parent
                     previous_sibling = None

@@ -387,7 +448,7 @@ class BeautifulSoup(Tag):
         """Push a start tag on to the stack.

         If this method returns None, the tag was rejected by the
-        SoupStrainer. You should proceed as if the tag had not occured
+        SoupStrainer. You should proceed as if the tag had not occurred
         in the document. For instance, if this was a self-closing tag,
         don't call handle_endtag.
         """
 ...
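A quick sketch of the consolidated URL check in action (mirrors the new tests further down):

```python
# Sketch: both bytes and unicode URL-like markup now funnel through
# _check_markup_is_url(), which warns instead of fetching anything.
import warnings
from bs4 import BeautifulSoup

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    BeautifulSoup(b'http://www.example.com/', 'html.parser')
print(any('looks like a URL' in str(w.message) for w in caught))  # True
```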
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 from collections import defaultdict
 import itertools
 import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    HTMLAwareEntitySubstitution,
     whitespace_re
     )

@@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder):

     Such as which tags are empty-element tags.
     """

-    preserve_whitespace_tags = set(['pre', 'textarea'])
+    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
     empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                               'spacer', 'link', 'frame', 'base'])
 ...
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __all__ = [
     'HTML5TreeBuilder',
     ]

+from pdb import set_trace
 import warnings
+import re
 from bs4.builder import (
     PERMISSIVE,
     HTML,

@@ -15,7 +18,10 @@ from bs4.element import (
     whitespace_re,
 )
 import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+    namespaces,
+    prefixes,
+    )
 from bs4.element import (
     Comment,
     Doctype,

@@ -23,6 +29,15 @@ from bs4.element import (
     Tag,
     )

+try:
+    # Pre-0.99999999
+    from html5lib.treebuilders import _base as treebuilder_base
+    new_html5lib = False
+except ImportError, e:
+    # 0.99999999 and up
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True
+
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
@@ -47,7 +62,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         if self.soup.parse_only is not None:
             warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+        extra_kwargs = dict()
+        if not isinstance(markup, unicode):
+            if new_html5lib:
+                extra_kwargs['override_encoding'] = self.user_specified_encoding
+            else:
+                extra_kwargs['encoding'] = self.user_specified_encoding
+        doc = parser.parse(markup, **extra_kwargs)

         # Set the character encoding detected by the tokenizer.
         if isinstance(markup, unicode):

@@ -55,11 +77,17 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
             # charEncoding to UTF-8 if it gets Unicode input.
             doc.original_encoding = None
         else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            if not isinstance(original_encoding, basestring):
+                # In 0.99999999 and up, the encoding is an html5lib
+                # Encoding object. We want to use a string for compatibility
+                # with other tree builders.
+                original_encoding = original_encoding.name
+            doc.original_encoding = original_encoding
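The keyword rename that `extra_kwargs` papers over, in isolation (hedged; the exact behavior depends on the installed html5lib release):

```python
# Sketch of the html5lib API change handled above: releases before
# 0.99999999 took `encoding`; 0.99999999 and up take `override_encoding`.
import html5lib

parser = html5lib.HTMLParser()
doc = parser.parse(b'<p>caf\xe9</p>', override_encoding='latin-1')
```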
     def create_treebuilder(self, namespaceHTMLElements):
         self.underlying_builder = TreeBuilderForHtml5lib(
-            self.soup, namespaceHTMLElements)
+            namespaceHTMLElements, self.soup)
         return self.underlying_builder

     def test_fragment_to_document(self, fragment):

@@ -67,10 +95,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         return u'<html><head></head><body>%s</body></html>' % fragment

-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):

-    def __init__(self, soup, namespaceHTMLElements):
-        self.soup = soup
+    def __init__(self, namespaceHTMLElements, soup=None):
+        if soup:
+            self.soup = soup
+        else:
+            from bs4 import BeautifulSoup
+            self.soup = BeautifulSoup("", "html.parser")
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

     def documentClass(self):

@@ -93,7 +125,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
         return TextNode(Comment(data), self.soup)

     def fragmentClass(self):
-        self.soup = BeautifulSoup("")
+        from bs4 import BeautifulSoup
+        self.soup = BeautifulSoup("", "html.parser")
         self.soup.name = "[document_fragment]"
         return Element(self.soup, self.soup, None)
@@ -105,7 +138,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
         return self.soup

     def getFragment(self):
-        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+        return treebuilder_base.TreeBuilder.getFragment(self).element
+
+    def testSerializer(self, element):
+        from bs4 import BeautifulSoup
+        rv = []
+        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+
+        def serializeElement(element, indent=0):
+            if isinstance(element, BeautifulSoup):
+                pass
+            if isinstance(element, Doctype):
+                m = doctype_re.match(element)
+                if m:
+                    name = m.group(1)
+                    if m.lastindex > 1:
+                        publicId = m.group(2) or ""
+                        systemId = m.group(3) or m.group(4) or ""
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (' ' * indent, name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
+                else:
+                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+            elif isinstance(element, Comment):
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
+            elif isinstance(element, NavigableString):
+                rv.append("|%s\"%s\"" % (' ' * indent, element))
+            else:
+                if element.namespace:
+                    name = "%s %s" % (prefixes[element.namespace],
+                                      element.name)
+                else:
+                    name = element.name
+                rv.append("|%s<%s>" % (' ' * indent, name))
+                if element.attrs:
+                    attributes = []
+                    for name, value in element.attrs.items():
+                        if isinstance(name, NamespacedAttribute):
+                            name = "%s %s" % (prefixes[name.namespace], name.name)
+                        if isinstance(value, list):
+                            value = " ".join(value)
+                        attributes.append((name, value))
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+                indent += 2
+                for child in element.children:
+                    serializeElement(child, indent)
+        serializeElement(element, 0)
+
+        return "\n".join(rv)
 class AttrList(object):
     def __init__(self, element):

@@ -137,9 +220,9 @@ class AttrList(object):
         return name in list(self.attrs.keys())

-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
     def __init__(self, element, soup, namespace):
-        html5lib.treebuilders._base.Node.__init__(self, element.name)
+        treebuilder_base.Node.__init__(self, element.name)
         self.element = element
         self.soup = soup
         self.namespace = namespace

@@ -158,8 +241,10 @@ class Element(treebuilder_base.Node):
             child = node
         elif node.element.__class__ == NavigableString:
             string_child = child = node.element
+            node.parent = self
         else:
             child = node.element
+            node.parent = self

         if not isinstance(child, basestring) and child.parent is not None:
             node.element.extract()

@@ -197,6 +282,8 @@ class Element(treebuilder_base.Node):
             most_recent_element=most_recent_element)

     def getAttributes(self):
+        if isinstance(self.element, Comment):
+            return {}
         return AttrList(self.element)

     def setAttributes(self, attributes):

@@ -224,11 +311,11 @@ class Element(treebuilder_base.Node):
     attributes = property(getAttributes, setAttributes)

     def insertText(self, data, insertBefore=None):
+        text = TextNode(self.soup.new_string(data), self.soup)
         if insertBefore:
-            text = TextNode(self.soup.new_string(data), self.soup)
-            self.insertBefore(data, insertBefore)
+            self.insertBefore(text, insertBefore)
         else:
-            self.appendChild(data)
+            self.appendChild(text)

     def insertBefore(self, node, refNode):
         index = self.element.index(refNode.element)

@@ -250,6 +337,7 @@ class Element(treebuilder_base.Node):
         # print "MOVE", self.element.contents
         # print "FROM", self.element
         # print "TO", new_parent.element
+
         element = self.element
         new_parent_element = new_parent.element
         # Determine what this tag's next_element will be once all the children

@@ -268,7 +356,6 @@ class Element(treebuilder_base.Node):
             new_parents_last_descendant_next_element = new_parent_element.next_element

         to_append = element.contents
-        append_after = new_parent_element.contents
         if len(to_append) > 0:
             # Set the first child's previous_element and previous_sibling
             # to elements within the new parent

@@ -285,12 +372,19 @@ class Element(treebuilder_base.Node):
             if new_parents_last_child:
                 new_parents_last_child.next_sibling = first_child

-            # Fix the last child's next_element and next_sibling
-            last_child = to_append[-1]
-            last_child.next_element = new_parents_last_descendant_next_element
+            # Find the very last element being moved. It is now the
+            # parent's last descendant. It has no .next_sibling and
+            # its .next_element is whatever the previous last
+            # descendant had.
+            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+
+            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
             if new_parents_last_descendant_next_element:
-                new_parents_last_descendant_next_element.previous_element = last_child
-            last_child.next_sibling = None
+                # TODO: This code has no test coverage and I'm not sure
+                # how to get html5lib to go through this path, but it's
+                # just the other side of the previous line.
+                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+            last_childs_last_descendant.next_sibling = None

         for child in to_append:
             child.parent = new_parent_element

@@ -324,7 +418,7 @@ class Element(treebuilder_base.Node):

 class TextNode(Element):
     def __init__(self, element, soup):
-        html5lib.treebuilders._base.Node.__init__(self, None)
+        treebuilder_base.Node.__init__(self, None)
         self.element = element
         self.soup = soup
 ...
"""Use the HTMLParser library to parse HTML files that aren't too bad.""" """Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__all__ = [ __all__ = [
'HTMLParserTreeBuilder', 'HTMLParserTreeBuilder',
] ]
......
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __all__ = [
     'LXMLTreeBuilderForXML',
     'LXMLTreeBuilder',

@@ -12,6 +14,7 @@ from bs4.element import (
     Doctype,
     NamespacedAttribute,
     ProcessingInstruction,
+    XMLProcessingInstruction,
 )
 from bs4.builder import (
     FAST,

@@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser

     is_xml = True
+    processing_instruction_class = XMLProcessingInstruction

     NAME = "lxml-xml"
     ALTERNATE_NAMES = ["xml"]

@@ -87,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):

         Each 4-tuple represents a strategy for parsing the document.
         """
+        # Instead of using UnicodeDammit to convert the bytestring to
+        # Unicode using different encodings, use EncodingDetector to
+        # iterate over the encodings, and tell lxml to try to parse
+        # the document as each one in turn.
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
+
         if isinstance(markup, unicode):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?

@@ -98,11 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             yield (markup.encode("utf8"), "utf8",
                    document_declared_encoding, False)

-        # Instead of using UnicodeDammit to convert the bytestring to
-        # Unicode using different encodings, use EncodingDetector to
-        # iterate over the encodings, and tell lxml to try to parse
-        # the document as each one in turn.
-        is_html = not self.is_xml
         try_encodings = [user_specified_encoding, document_declared_encoding]
         detector = EncodingDetector(
             markup, try_encodings, is_html, exclude_encodings)

@@ -201,7 +210,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def pi(self, target, data):
         self.soup.endData()
         self.soup.handle_data(target + ' ' + data)
-        self.soup.endData(ProcessingInstruction)
+        self.soup.endData(self.processing_instruction_class)

     def data(self, content):
         self.soup.handle_data(content)

@@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

     features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
     is_xml = False
+    processing_instruction_class = ProcessingInstruction

     def default_parser(self, encoding):
         return etree.HTMLParser
 ...
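What the new `XMLProcessingInstruction` class buys, in a hedged sketch (mirrors the XML smoke test added further down; requires lxml):

```python
# Sketch: XML processing instructions now round-trip with their '?>' suffix
# instead of being serialized like SGML processing instructions.
from bs4 import BeautifulSoup

markup = b'<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>'
soup = BeautifulSoup(markup, 'lxml-xml')
print(soup.encode('utf8') == markup)  # True with bs4 4.5.3 and lxml installed
```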
@@ -6,9 +6,10 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
 Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.

 __license__ = "MIT"

+from pdb import set_trace
 import codecs
 from htmlentitydefs import codepoint2name
 import re

@@ -346,7 +347,7 @@ class UnicodeDammit:
         self.tried_encodings = []
         self.contains_replacement_characters = False
         self.is_html = is_html
+        self.log = logging.getLogger(__name__)

         self.detector = EncodingDetector(
             markup, override_encodings, is_html, exclude_encodings)

@@ -376,9 +377,10 @@ class UnicodeDammit:
                 if encoding != "ascii":
                     u = self._convert_from(encoding, "replace")
                     if u is not None:
-                        logging.warning(
+                        self.log.warning(
                             "Some characters could not be decoded, and were "
-                            "replaced with REPLACEMENT CHARACTER.")
+                            "replaced with REPLACEMENT CHARACTER."
+                        )
                         self.contains_replacement_characters = True
                         break
 ...
"""Diagnostic functions, mainly for use when doing tech support.""" """Diagnostic functions, mainly for use when doing tech support."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
import cProfile import cProfile
...@@ -56,7 +58,8 @@ def diagnose(data): ...@@ -56,7 +58,8 @@ def diagnose(data):
data = data.read() data = data.read()
elif os.path.exists(data): elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data print '"%s" looks like a filename. Reading data from the file.' % data
data = open(data).read() with open(data) as fp:
data = fp.read()
elif data.startswith("http:") or data.startswith("https:"): elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
......
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __license__ = "MIT"

+from pdb import set_trace
 import collections
 import re
+import shlex
 import sys
 import warnings

 from bs4.dammit import EntitySubstitution

@@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):

     preformatted_tags = set(["pre"])

+    preserve_whitespace_tags = set(['pre', 'textarea'])
+
     @classmethod
     def _substitute_if_appropriate(cls, ns, f):
         if (isinstance(ns, NavigableString)

@@ -169,11 +173,19 @@ class PageElement(object):

         This is used when mapping a formatter name ("minimal") to an
         appropriate function (one that performs entity-substitution on
-        the contents of <script> and <style> tags, or not). It's
+        the contents of <script> and <style> tags, or not). It can be
         inefficient, but it should be called very rarely.
         """
+        if self.known_xml is not None:
+            # Most of the time we will have determined this when the
+            # document is parsed.
+            return self.known_xml
+
+        # Otherwise, it's likely that this element was created by
+        # direct invocation of the constructor from within the user's
+        # Python code.
         if self.parent is None:
-            # This is the top-level object. It should have .is_xml set
+            # This is the top-level object. It should have .known_xml set
             # from tree creation. If not, take a guess--BS is usually
             # used on HTML markup.
             return getattr(self, 'is_xml', False)

@@ -637,7 +649,7 @@ class PageElement(object):
             return lambda el: el._attr_value_as_string(
                 attribute, '').startswith(value)
         elif operator == '$':
-            # string represenation of `attribute` ends with `value`
+            # string representation of `attribute` ends with `value`
             return lambda el: el._attr_value_as_string(
                 attribute, '').endswith(value)
         elif operator == '*':
@@ -677,6 +689,11 @@ class NavigableString(unicode, PageElement):
     PREFIX = ''
     SUFFIX = ''

+    # We can't tell just by looking at a string whether it's contained
+    # in an XML document or an HTML document.
+    known_xml = None
+
     def __new__(cls, value):
         """Create a new NavigableString.

@@ -743,10 +760,16 @@ class CData(PreformattedString):
     SUFFIX = u']]>'

 class ProcessingInstruction(PreformattedString):
+    """A SGML processing instruction."""

     PREFIX = u'<?'
     SUFFIX = u'>'

+class XMLProcessingInstruction(ProcessingInstruction):
+    """An XML processing instruction."""
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
 class Comment(PreformattedString):
     PREFIX = u'<!--'
@@ -781,7 +804,8 @@ class Tag(PageElement):
     """Represents a found HTML tag with its attributes and contents."""

     def __init__(self, parser=None, builder=None, name=None, namespace=None,
-                 prefix=None, attrs=None, parent=None, previous=None):
+                 prefix=None, attrs=None, parent=None, previous=None,
+                 is_xml=None):
         "Basic constructor."

         if parser is None:

@@ -795,6 +819,14 @@ class Tag(PageElement):
         self.name = name
         self.namespace = namespace
         self.prefix = prefix
+        if builder is not None:
+            preserve_whitespace_tags = builder.preserve_whitespace_tags
+        else:
+            if is_xml:
+                preserve_whitespace_tags = []
+            else:
+                preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
+        self.preserve_whitespace_tags = preserve_whitespace_tags
         if attrs is None:
             attrs = {}
         elif attrs:

@@ -805,6 +837,13 @@ class Tag(PageElement):
                 attrs = dict(attrs)
         else:
             attrs = dict(attrs)
+
+        # If possible, determine ahead of time whether this tag is an
+        # XML tag.
+        if builder:
+            self.known_xml = builder.is_xml
+        else:
+            self.known_xml = is_xml
         self.attrs = attrs
         self.contents = []
         self.setup(parent, previous)

@@ -824,7 +863,7 @@ class Tag(PageElement):
         Its contents are a copy of the old Tag's contents.
         """
         clone = type(self)(None, self.builder, self.name, self.namespace,
-                           self.nsprefix, self.attrs)
+                           self.nsprefix, self.attrs, is_xml=self._is_xml)
         for attr in ('can_be_empty_element', 'hidden'):
             setattr(clone, attr, getattr(self, attr))
         for child in self.contents:

@@ -1057,10 +1096,11 @@ class Tag(PageElement):

     def _should_pretty_print(self, indent_level):
         """Should this tag be pretty-printed?"""
+
         return (
-            indent_level is not None and
-            (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
-             or self._is_xml))
+            indent_level is not None
+            and self.name not in self.preserve_whitespace_tags
+        )

     def decode(self, indent_level=None,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
_selector_combinators = ['>', '+', '~'] _selector_combinators = ['>', '+', '~']
_select_debug = False _select_debug = False
quoted_colon = re.compile('"[^"]*:[^"]*"')
def select_one(self, selector): def select_one(self, selector):
"""Perform a CSS selection operation on the current element.""" """Perform a CSS selection operation on the current element."""
value = self.select(selector, limit=1) value = self.select(selector, limit=1)
...@@ -1305,8 +1346,7 @@ class Tag(PageElement): ...@@ -1305,8 +1346,7 @@ class Tag(PageElement):
if limit and len(context) >= limit: if limit and len(context) >= limit:
break break
return context return context
tokens = shlex.split(selector)
tokens = selector.split()
current_context = [self] current_context = [self]
if tokens[-1] in self._selector_combinators: if tokens[-1] in self._selector_combinators:
...@@ -1358,7 +1398,7 @@ class Tag(PageElement): ...@@ -1358,7 +1398,7 @@ class Tag(PageElement):
return classes.issubset(candidate.get('class', [])) return classes.issubset(candidate.get('class', []))
checker = classes_match checker = classes_match
elif ':' in token: elif ':' in token and not self.quoted_colon.search(token):
# Pseudo-class # Pseudo-class
tag_name, pseudo = token.split(':', 1) tag_name, pseudo = token.split(':', 1)
if tag_name == '': if tag_name == '':
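The practical difference between the two tokenizers, in a pure-Python sketch: a plain `str.split()` broke quoted attribute values on whitespace and let a `:` inside a quoted value look like a pseudo-class marker.

```python
# Sketch: str.split() vs shlex.split() on a quoted CSS attribute selector.
import shlex

selector = 'div[title="hello world"]'
print(selector.split())       # ['div[title="hello', 'world"]'] -- two tokens
print(shlex.split(selector))  # ['div[title=hello world]']      -- one token
```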
@@ -1389,10 +1429,7 @@ class Tag(PageElement):
                                 self.count += 1
                                 if self.count == self.destination:
                                     return True
-                                if self.count > self.destination:
-                                    # Stop the generator that's sending us
-                                    # these things.
-                                    raise StopIteration()
-                                return False
+                                else:
+                                    return False
                         checker = Counter(pseudo_value).nth_child_of_type
                     else:

@@ -1498,13 +1535,12 @@ class Tag(PageElement):
                         # don't include it in the context more than once.
                         new_context.append(candidate)
                         new_context_ids.add(id(candidate))
+                        if limit and len(new_context) >= limit:
+                            break
                     elif self._select_debug:
                         print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))

             current_context = new_context
-            if limit and len(current_context) >= limit:
-                current_context = current_context[:limit]

         if self._select_debug:
             print "Final verdict:"
@@ -1668,20 +1704,14 @@ class SoupStrainer(object):
         if isinstance(markup, list) or isinstance(markup, tuple):
             # This should only happen when searching a multi-valued attribute
             # like 'class'.
-            if (isinstance(match_against, unicode)
-                and ' ' in match_against):
-                # A bit of a special case. If they try to match "foo
-                # bar" on a multivalue attribute's value, only accept
-                # the literal value "foo bar"
-                #
-                # XXX This is going to be pretty slow because we keep
-                # splitting match_against. But it shouldn't come up
-                # too often.
-                return (whitespace_re.split(match_against) == markup)
-            else:
-                for item in markup:
-                    if self._matches(item, match_against):
-                        return True
+            for item in markup:
+                if self._matches(item, match_against):
+                    return True
+            # We didn't match any particular value of the multivalue
+            # attribute, but maybe we match the attribute value when
+            # considered as a string.
+            if self._matches(' '.join(markup), match_against):
+                return True
             return False

         if match_against is True:
 ...
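Effect of the new fall-through, in a hedged sketch:

```python
# Sketch: after per-item matching fails, the whole multi-valued
# attribute is retried as a single space-joined string.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p class="foo bar">x</p>', 'html.parser')
print(soup.find_all('p', class_='foo'))      # matches an individual value
print(soup.find_all('p', class_='foo bar'))  # now also matches via ' '.join
```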
"""Helper classes for tests.""" """Helper classes for tests."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
import pickle import pickle
...@@ -137,6 +139,14 @@ class HTMLTreeBuilderSmokeTest(object): ...@@ -137,6 +139,14 @@ class HTMLTreeBuilderSmokeTest(object):
markup.replace(b"\n", b"")) markup.replace(b"\n", b""))
def test_processing_instruction(self): def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class
# even when the markup is already Unicode and there is no
# need to process anything.
markup = u"""<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.decode())
markup = b"""<?PITarget PIContent?>""" markup = b"""<?PITarget PIContent?>"""
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8")) self.assertEqual(markup, soup.encode("utf8"))
...@@ -215,9 +225,22 @@ Hello, world! ...@@ -215,9 +225,22 @@ Hello, world!
self.assertEqual(comment, baz.previous_element) self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self): def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags.""" """Whitespace must be preserved in <pre> and <textarea> tags,
self.assertSoupEquals("<pre> </pre>") even if that would mean not prettifying the markup.
self.assertSoupEquals("<textarea> woo </textarea>") """
pre_markup = "<pre> </pre>"
textarea_markup = "<textarea> woo\nwoo </textarea>"
self.assertSoupEquals(pre_markup)
self.assertSoupEquals(textarea_markup)
soup = self.soup(pre_markup)
self.assertEqual(soup.pre.prettify(), pre_markup)
soup = self.soup(textarea_markup)
self.assertEqual(soup.textarea.prettify(), textarea_markup)
soup = self.soup("<textarea></textarea>")
self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
def test_nested_inline_elements(self): def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely.""" """Inline elements can be nested indefinitely."""
...@@ -480,7 +503,9 @@ Hello, world! ...@@ -480,7 +503,9 @@ Hello, world!
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>' hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
soup = self.soup( soup = self.soup(
hebrew_document, from_encoding="iso8859-8") hebrew_document, from_encoding="iso8859-8")
self.assertEqual(soup.original_encoding, 'iso8859-8') # Some tree builders call it iso8859-8, others call it iso-8859-9.
# That's not a difference we really care about.
assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
self.assertEqual( self.assertEqual(
soup.encode('utf-8'), soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8")) hebrew_document.decode("iso8859-8").encode("utf-8"))
...@@ -563,6 +588,11 @@ class XMLTreeBuilderSmokeTest(object): ...@@ -563,6 +588,11 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8")) self.assertEqual(markup, soup.encode("utf8"))
def test_processing_instruction(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_real_xhtml_document(self): def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in.""" """A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?> markup = b"""<?xml version="1.0" encoding="utf-8"?>
......
...@@ -84,6 +84,33 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): ...@@ -84,6 +84,33 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p'))) self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_containing_identical_whitespace_nodes(self):
"""Verify that we keep the two whitespace nodes in this
document distinct when reparenting the adjacent <tbody> tags.
"""
markup = '<table> <tbody><tbody><ims></tbody> </table>'
soup = self.soup(markup)
space1, space2 = soup.find_all(string=' ')
tbody1, tbody2 = soup.find_all('tbody')
assert space1.next_element is tbody1
assert tbody2.next_element is space2
def test_reparented_markup_containing_children(self):
markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
soup = self.soup(markup)
noscript = soup.noscript
self.assertEqual("target", noscript.next_element)
target = soup.find(string='target')
# The 'aftermath' string was duplicated; we want the second one.
final_aftermath = soup.find_all(string='aftermath')[-1]
# The <noscript> tag was moved beneath a copy of the <a> tag,
# but the 'target' string within is still connected to the
# (second) 'aftermath' string.
self.assertEqual(final_aftermath, target.next_element)
self.assertEqual(target, final_aftermath.previous_element)
def test_processing_instruction(self): def test_processing_instruction(self):
"""Processing instructions become comments.""" """Processing instructions become comments."""
markup = b"""<?PITarget PIContent?>""" markup = b"""<?PITarget PIContent?>"""
...@@ -96,3 +123,8 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): ...@@ -96,3 +123,8 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
a1, a2 = soup.find_all('a')
self.assertEqual(a1, a2)
assert a1 is not a2
def test_foster_parenting(self):
markup = b"""<table><td></tbody>A"""
soup = self.soup(markup)
self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
@@ -35,7 +35,6 @@ try:
except ImportError, e:
LXML_PRESENT = False
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
@@ -77,7 +76,7 @@ class TestWarnings(SoupTest):
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", "html.parser")
self.assertEquals([], w)
self.assertEqual([], w)
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
@@ -118,15 +117,34 @@ class TestWarnings(SoupTest):
soup = self.soup(filename)
self.assertEqual(0, len(w))
def test_url_warning(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("http://www.crummy.com/")
msg = str(w[0].message)
self.assertTrue("looks like a URL" in msg)
def test_url_warning_with_bytes_url(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/")
# Be aware this isn't the only warning that can be raised during
# execution.
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_unicode_url(self):
with warnings.catch_warnings(record=True) as warning_list:
# note - this url must differ from the bytes one otherwise
# python's warnings system swallows the second warning
soup = self.soup(u"http://www.crummyunicode.com/")
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_bytes_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_unicode_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(u"http://www.crummyuncode.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
with warnings.catch_warnings(record=True) as w:
soup = self.soup("http://www.crummy.com/ is great")
self.assertEqual(0, len(w))
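The behaviour under test, sketched with the public constructor: feeding BeautifulSoup something that looks like a bare URL raises a "looks like a URL" UserWarning, since the caller probably meant to fetch the page rather than parse the URL text.

import warnings
from bs4 import BeautifulSoup

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    BeautifulSoup('http://www.example.com/', 'html.parser')
print([str(w.message) for w in caught])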
class TestSelectiveParsing(SoupTest):
@@ -260,7 +278,7 @@ class TestEncodingConversion(SoupTest):
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
@skipIf(
PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self): def test_attribute_name_containing_unicode_characters(self):
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
...
@@ -222,6 +222,17 @@ class TestFindAllByName(TreeTest):
self.assertSelects(
tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
def test_find_with_multi_valued_attribute(self):
soup = self.soup(
"<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>"
)
r1 = soup.find('div', 'a d')
r2 = soup.find('div', re.compile(r'a d'))
r3, r4 = soup.find_all('div', ['a b', 'a d'])
self.assertEqual('3', r1.string)
self.assertEqual('3', r2.string)
self.assertEqual('1', r3.string)
self.assertEqual('3', r4.string)
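Why the plain-string search matches: class is a multi-valued attribute, so bs4 stores it as a list, but a string or regex that fails against the individual values is also tried against the values joined back together with spaces. A short sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div class='a d'>3</div>", 'html.parser')
print(soup.div['class'])        # ['a', 'd'] - stored as a list
print(soup.find('div', 'a d'))  # still matches, via the joined string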
class TestFindAllByAttribute(TreeTest):
@@ -294,10 +305,10 @@ class TestFindAllByAttribute(TreeTest):
f = tree.find_all("gar", class_=re.compile("a")) f = tree.find_all("gar", class_=re.compile("a"))
self.assertSelects(f, ["Found it"]) self.assertSelects(f, ["Found it"])
# Since the class is not the string "foo bar", but the two
# strings "foo" and "bar", this will not find anything.
f = tree.find_all("gar", class_=re.compile("o b"))
self.assertSelects(f, [])
# If the search fails to match the individual strings "foo" and "bar",
# it will be tried against the combined string "foo bar".
f = tree.find_all("gar", class_=re.compile("o b"))
self.assertSelects(f, ["Found it"])
def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
soup = self.soup("<a class='bar'>Found it</a>")
@@ -335,7 +346,7 @@ class TestFindAllByAttribute(TreeTest):
strainer = SoupStrainer(attrs={'id' : 'first'})
self.assertSelects(tree.find_all(strainer), ['Match.'])
def test_find_all_with_missing_atribute(self):
def test_find_all_with_missing_attribute(self):
# You can pass in None as the value of an attribute to find_all.
# This will match tags that do not have that attribute set.
tree = self.soup("""<a id="1">ID present.</a>
@@ -1328,6 +1339,13 @@ class TestPersistence(SoupTest):
copied = copy.deepcopy(self.tree)
self.assertEqual(copied.decode(), self.tree.decode())
def test_copy_preserves_encoding(self):
soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
encoding = soup.original_encoding
copy = soup.__copy__()
self.assertEqual(u"<p> </p>", unicode(copy))
self.assertEqual(encoding, copy.original_encoding)
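The same guarantee through the standard library's copy protocol (copy.copy dispatches to __copy__), sketched outside the test harness:

import copy
from bs4 import BeautifulSoup

soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
clone = copy.copy(soup)  # dispatches to BeautifulSoup.__copy__
print(soup.original_encoding == clone.original_encoding)  # True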
def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled.
html = u"<b>\N{SNOWMAN}</b>"
@@ -1676,8 +1694,8 @@ class TestSoupSelector(TreeTest):
def setUp(self):
self.soup = BeautifulSoup(self.HTML, 'html.parser')
def assertSelects(self, selector, expected_ids):
el_ids = [el['id'] for el in self.soup.select(selector)]
def assertSelects(self, selector, expected_ids, **kwargs):
el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)]
el_ids.sort()
expected_ids.sort()
self.assertEqual(expected_ids, el_ids,
@@ -1720,6 +1738,13 @@ class TestSoupSelector(TreeTest):
for selector in ('html div', 'html body div', 'body div'):
self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])
def test_limit(self):
self.assertSelects('html div', ['main'], limit=1)
self.assertSelects('html body div', ['inner', 'main'], limit=2)
self.assertSelects('body div', ['data1', 'main', 'inner', 'footer'],
limit=10)
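The limit keyword the diff threads through assertSelects makes select() stop after the first N matches, like the existing limit argument to find_all(). A quick sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>1</p><p>2</p><p>3</p>', 'html.parser')
print(soup.select('p', limit=2))  # only the first two <p> tags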
def test_tag_no_match(self):
self.assertEqual(len(self.soup.select('del')), 0)
@@ -1902,6 +1927,14 @@ class TestSoupSelector(TreeTest):
('div[data-tag]', ['data1'])
)
def test_quoted_space_in_selector_name(self):
html = """<div style="display: wrong">nope</div>
<div style="display: right">yes</div>
"""
soup = BeautifulSoup(html, 'html.parser')
[chosen] = soup.select('div[style="display: right"]')
self.assertEqual("yes", chosen.string)
def test_unsupported_pseudoclass(self):
self.assertRaises(
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
...
from .core import where, old_where
__version__ = "2016.09.26" __version__ = "2017.04.17"