--- BeautifulSoup.py +++ BeautifulSoup.py @@ -76,7 +76,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. """ -from __future__ import generators + __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "3.1.0.1" @@ -84,12 +84,12 @@ __license__ = "New-style BSD" import codecs -import markupbase +import _markupbase import types import re -from HTMLParser import HTMLParser, HTMLParseError +from html.parser import HTMLParser, HTMLParseError try: - from htmlentitydefs import name2codepoint + from html.entities import name2codepoint except ImportError: name2codepoint = {} try: @@ -98,18 +98,18 @@ from sets import Set as set #These hacks make Beautiful Soup able to parse XML with namespaces -markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match +_markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match DEFAULT_OUTPUT_ENCODING = "utf-8" # First, the classes that represent markup elements. -def sob(unicode, encoding): +def sob(str, encoding): """Returns either the given Unicode string or its encoding.""" if encoding is None: - return unicode + return str else: - return unicode.encode(encoding) + return str.encode(encoding) class PageElement: """Contains the navigational information for some part of the page @@ -178,8 +178,8 @@ return lastChild def insert(self, position, newChild): - if (isinstance(newChild, basestring) - or isinstance(newChild, unicode)) \ + if (isinstance(newChild, str) + or isinstance(newChild, str)) \ and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) @@ -334,7 +334,7 @@ g = generator() while True: try: - i = g.next() + i = g.__next__() except StopIteration: break if i: @@ -385,22 +385,22 @@ def toEncoding(self, s, encoding=None): """Encodes an object to a string in some encoding, or to Unicode. .""" - if isinstance(s, unicode): + if isinstance(s, str): if encoding: s = s.encode(encoding) elif isinstance(s, str): if encoding: s = s.encode(encoding) else: - s = unicode(s) + s = str(s) else: if encoding: s = self.toEncoding(str(s), encoding) else: - s = unicode(s) + s = str(s) return s -class NavigableString(unicode, PageElement): +class NavigableString(str, PageElement): def __new__(cls, value): """Create a new NavigableString. @@ -410,12 +410,12 @@ passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ - if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + if isinstance(value, str): + return str.__new__(cls, value) + return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): - return (unicode(self),) + return (str(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -424,7 +424,7 @@ if attr == 'string': return self else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)) def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.decode().encode(encoding) @@ -435,23 +435,23 @@ class CData(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): - return u'' + return '' class ProcessingInstruction(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): output = self - if u'%SOUP-ENCODING%' in output: + if '%SOUP-ENCODING%' in output: output = self.substituteEncoding(output, eventualEncoding) - return u'' + return '' class Comment(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): - return u'' + return '' class Declaration(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): - return u'' + return '' class Tag(PageElement): @@ -460,7 +460,7 @@ def _invert(h): "Cheap function to invert a hash." i = {} - for k,v in h.items(): + for k,v in list(h.items()): i[v] = k return i @@ -479,23 +479,23 @@ escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: - return unichr(name2codepoint[x]) + return chr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: - return u'&%s;' % x + return '&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': - return unichr(int(x[2:], 16)) + return chr(int(x[2:], 16)) else: - return unichr(int(x[1:])) + return chr(int(x[1:])) elif self.escapeUnrecognizedEntities: - return u'&%s;' % x + return '&%s;' % x else: - return u'&%s;' % x + return '&%s;' % x def __init__(self, parser, name, attrs=None, parent=None, previous=None): @@ -524,7 +524,7 @@ return kval return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", self._convertEntities, val)) - self.attrs = map(convert, self.attrs) + self.attrs = list(map(convert, self.attrs)) def get(self, key, default=None): """Returns the value of the 'key' attribute for the tag, or @@ -533,7 +533,7 @@ return self._getAttrMap().get(key, default) def has_key(self, key): - return self._getAttrMap().has_key(key) + return key in self._getAttrMap() def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, @@ -551,7 +551,7 @@ def __contains__(self, x): return x in self.contents - def __nonzero__(self): + def __bool__(self): "A tag is non-None even if it has no contents." return True @@ -577,14 +577,14 @@ #We don't break because bad HTML can define the same #attribute multiple times. self._getAttrMap() - if self.attrMap.has_key(key): + if key in self.attrMap: del self.attrMap[key] def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its findAll() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" - return apply(self.findAll, args, kwargs) + return self.findAll(*args, **kwargs) def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) @@ -592,7 +592,7 @@ return self.find(tag[:-3]) elif tag.find('__') != 0: return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, @@ -868,7 +868,7 @@ if isinstance(markupName, Tag): markup = markupName markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ + callFunctionWithTagData = hasattr(self.name, '__call__') \ and not isinstance(markupName, Tag) if (not self.name) \ @@ -880,7 +880,7 @@ else: match = True markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): + for attr, matchAgainst in list(self.attrs.items()): if not markupAttrMap: if hasattr(markupAttrs, 'get'): markupAttrMap = markupAttrs @@ -921,16 +921,16 @@ if self._matches(markup, self.text): found = markup else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ + raise Exception("I don't know how to match against a %s" \ + % markup.__class__) return found def _matches(self, markup, matchAgainst): #print "Matching %s against %s" % (markup, matchAgainst) result = False - if matchAgainst == True and type(matchAgainst) == types.BooleanType: + if matchAgainst == True and type(matchAgainst) == bool: result = markup != None - elif callable(matchAgainst): + elif hasattr(matchAgainst, '__call__'): result = matchAgainst(markup) else: #Custom match methods take the tag as an argument, but all @@ -938,7 +938,7 @@ if isinstance(markup, Tag): markup = markup.name if markup is not None and not isString(markup): - markup = unicode(markup) + markup = str(markup) #Now we know that chunk is either a string, or None. if hasattr(matchAgainst, 'match'): # It's a regexp object. @@ -947,10 +947,10 @@ and (markup is not None or not isString(matchAgainst))): result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) + result = matchAgainst in markup elif matchAgainst and isString(markup): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) + if isinstance(markup, str): + matchAgainst = str(matchAgainst) else: matchAgainst = str(matchAgainst) @@ -971,13 +971,13 @@ """Convenience method that works with all 2.x versions of Python to determine whether or not something is listlike.""" return ((hasattr(l, '__iter__') and not isString(l)) - or (type(l) in (types.ListType, types.TupleType))) + or (type(l) in (list, tuple))) def isString(s): """Convenience method that works with all 2.x versions of Python to determine whether or not something is stringlike.""" try: - return isinstance(s, unicode) or isinstance(s, basestring) + return isinstance(s, str) or isinstance(s, str) except NameError: return isinstance(s, str) @@ -989,7 +989,7 @@ for portion in args: if hasattr(portion, 'items'): #It's a map. Merge it. - for k,v in portion.items(): + for k,v in list(portion.items()): built[k] = v elif isList(portion) and not isString(portion): #It's a list. Map each item to the default. @@ -1034,7 +1034,7 @@ object, possibly one with a %SOUP-ENCODING% slot into which an encoding will be plugged later.""" if text[:3] == "xml": - text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" + text = "xml version='1.0' encoding='%SOUP-ENCODING%'" self._toStringSubclass(text, ProcessingInstruction) def handle_comment(self, text): @@ -1044,7 +1044,7 @@ def handle_charref(self, ref): "Handle character references as data." if self.soup.convertEntities: - data = unichr(int(ref)) + data = chr(int(ref)) else: data = '&#%s;' % ref self.handle_data(data) @@ -1056,7 +1056,7 @@ data = None if self.soup.convertHTMLEntities: try: - data = unichr(name2codepoint[ref]) + data = chr(name2codepoint[ref]) except KeyError: pass @@ -1147,7 +1147,7 @@ lambda x: '') ] - ROOT_TAG_NAME = u'[document]' + ROOT_TAG_NAME = '[document]' HTML_ENTITIES = "html" XML_ENTITIES = "xml" @@ -1236,14 +1236,14 @@ def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup - if isinstance(markup, unicode): + if isinstance(markup, str): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) - markup = dammit.unicode + markup = dammit.str self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: @@ -1269,8 +1269,8 @@ def isSelfClosingTag(self, name): """Returns true iff the given string is the name of a self-closing tag according to this parser.""" - return self.SELF_CLOSING_TAGS.has_key(name) \ - or self.instanceSelfClosingTags.has_key(name) + return name in self.SELF_CLOSING_TAGS \ + or name in self.instanceSelfClosingTags def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) @@ -1305,7 +1305,7 @@ def endData(self, containerClass=NavigableString): if self.currentData: - currentData = u''.join(self.currentData) + currentData = ''.join(self.currentData) if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and not set([tag.name for tag in self.tagStack]).intersection( self.PRESERVE_WHITESPACE_TAGS)): @@ -1368,7 +1368,7 @@ nestingResetTriggers = self.NESTABLE_TAGS.get(name) isNestable = nestingResetTriggers != None - isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + isResetNesting = name in self.RESET_NESTING_TAGS popTo = None inclusive = True for i in range(len(self.tagStack)-1, 0, -1): @@ -1381,7 +1381,7 @@ if (nestingResetTriggers != None and p.name in nestingResetTriggers) \ or (nestingResetTriggers == None and isResetNesting - and self.RESET_NESTING_TAGS.has_key(p.name)): + and p.name in self.RESET_NESTING_TAGS): #If we encounter one of the nesting reset triggers #peculiar to this tag, or we encounter another tag @@ -1399,7 +1399,7 @@ if self.quoteStack: #This is not a real tag. #print "<%s> is not real!" % name - attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + attrs = ''.join([' %s="%s"' % (x_y[0], x_y[1]) for x_y in attrs]) self.handle_data('<%s%s>' % (name, attrs)) return self.endData() @@ -1493,7 +1493,7 @@ BeautifulStoneSoup before writing your own subclass.""" def __init__(self, *args, **kwargs): - if not kwargs.has_key('smartQuotesTo'): + if 'smartQuotesTo' not in kwargs: kwargs['smartQuotesTo'] = self.HTML_ENTITIES kwargs['isHTML'] = True BeautifulStoneSoup.__init__(self, *args, **kwargs) @@ -1677,7 +1677,7 @@ parent._getAttrMap() if (isinstance(tag, Tag) and len(tag.contents) == 1 and isinstance(tag.contents[0], NavigableString) and - not parent.attrMap.has_key(tag.name)): + tag.name not in parent.attrMap): parent[tag.name] = tag.contents[0] BeautifulStoneSoup.popTag(self) @@ -1751,9 +1751,9 @@ self._detectEncoding(markup, isHTML) self.smartQuotesTo = smartQuotesTo self.triedEncodings = [] - if markup == '' or isinstance(markup, unicode): + if markup == '' or isinstance(markup, str): self.originalEncoding = None - self.unicode = unicode(markup) + self.str = str(markup) return u = None @@ -1766,7 +1766,7 @@ if u: break # If no luck and we have auto-detection library, try that: - if not u and chardet and not isinstance(self.markup, unicode): + if not u and chardet and not isinstance(self.markup, str): u = self._convertFrom(chardet.detect(self.markup)['encoding']) # As a last resort, try utf-8 and windows-1252: @@ -1775,7 +1775,7 @@ u = self._convertFrom(proposed_encoding) if u: break - self.unicode = u + self.str = u if not u: self.originalEncoding = None def _subMSChar(self, match): @@ -1783,7 +1783,7 @@ entity.""" orig = match.group(1) sub = self.MS_CHARS.get(orig) - if type(sub) == types.TupleType: + if type(sub) == tuple: if self.smartQuotesTo == 'xml': sub = '&#x'.encode() + sub[1].encode() + ';'.encode() else: @@ -1804,7 +1804,7 @@ if self.smartQuotesTo and proposed.lower() in("windows-1252", "iso-8859-1", "iso-8859-2"): - smart_quotes_re = "([\x80-\x9f])" + smart_quotes_re = b"([\x80-\x9f])" smart_quotes_compiled = re.compile(smart_quotes_re) markup = smart_quotes_compiled.sub(self._subMSChar, markup) @@ -1813,7 +1813,7 @@ u = self._toUnicode(markup, proposed) self.markup = u self.originalEncoding = proposed - except Exception, e: + except Exception as e: # print "That didn't work!" # print e return None @@ -1842,7 +1842,7 @@ elif data[:4] == '\xff\xfe\x00\x00': encoding = 'utf-32le' data = data[4:] - newdata = unicode(data, encoding) + newdata = str(data, encoding) return newdata def _detectEncoding(self, xml_data, isHTML=False): @@ -1855,41 +1855,41 @@ elif xml_data[:4] == '\x00\x3c\x00\x3f': # UTF-16BE sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') + xml_data = str(xml_data, 'utf-16be').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ and (xml_data[2:4] != '\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + xml_data = str(xml_data[2:], 'utf-16be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x3f\x00': # UTF-16LE sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + xml_data = str(xml_data, 'utf-16le').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ (xml_data[2:4] != '\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + xml_data = str(xml_data[2:], 'utf-16le').encode('utf-8') elif xml_data[:4] == '\x00\x00\x00\x3c': # UTF-32BE sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + xml_data = str(xml_data, 'utf-32be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x00\x00': # UTF-32LE sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + xml_data = str(xml_data, 'utf-32le').encode('utf-8') elif xml_data[:4] == '\x00\x00\xfe\xff': # UTF-32BE with BOM sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + xml_data = str(xml_data[4:], 'utf-32be').encode('utf-8') elif xml_data[:4] == '\xff\xfe\x00\x00': # UTF-32LE with BOM sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + xml_data = str(xml_data[4:], 'utf-32le').encode('utf-8') elif xml_data[:3] == '\xef\xbb\xbf': # UTF-8 with BOM sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + xml_data = str(xml_data[3:], 'utf-8').encode('utf-8') else: sniffed_xml_encoding = 'ascii' pass @@ -1954,41 +1954,41 @@ 250,251,252,253,254,255) import string c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) - MS_CHARS = { '\x80' : ('euro', '20AC'), - '\x81' : ' ', - '\x82' : ('sbquo', '201A'), - '\x83' : ('fnof', '192'), - '\x84' : ('bdquo', '201E'), - '\x85' : ('hellip', '2026'), - '\x86' : ('dagger', '2020'), - '\x87' : ('Dagger', '2021'), - '\x88' : ('circ', '2C6'), - '\x89' : ('permil', '2030'), - '\x8A' : ('Scaron', '160'), - '\x8B' : ('lsaquo', '2039'), - '\x8C' : ('OElig', '152'), - '\x8D' : '?', - '\x8E' : ('#x17D', '17D'), - '\x8F' : '?', - '\x90' : '?', - '\x91' : ('lsquo', '2018'), - '\x92' : ('rsquo', '2019'), - '\x93' : ('ldquo', '201C'), - '\x94' : ('rdquo', '201D'), - '\x95' : ('bull', '2022'), - '\x96' : ('ndash', '2013'), - '\x97' : ('mdash', '2014'), - '\x98' : ('tilde', '2DC'), - '\x99' : ('trade', '2122'), - '\x9a' : ('scaron', '161'), - '\x9b' : ('rsaquo', '203A'), - '\x9c' : ('oelig', '153'), - '\x9d' : '?', - '\x9e' : ('#x17E', '17E'), - '\x9f' : ('Yuml', ''),} + MS_CHARS = { b'\x80' : ('euro', '20AC'), + b'\x81' : ' ', + b'\x82' : ('sbquo', '201A'), + b'\x83' : ('fnof', '192'), + b'\x84' : ('bdquo', '201E'), + b'\x85' : ('hellip', '2026'), + b'\x86' : ('dagger', '2020'), + b'\x87' : ('Dagger', '2021'), + b'\x88' : ('circ', '2C6'), + b'\x89' : ('permil', '2030'), + b'\x8A' : ('Scaron', '160'), + b'\x8B' : ('lsaquo', '2039'), + b'\x8C' : ('OElig', '152'), + b'\x8D' : '?', + b'\x8E' : ('#x17D', '17D'), + b'\x8F' : '?', + b'\x90' : '?', + b'\x91' : ('lsquo', '2018'), + b'\x92' : ('rsquo', '2019'), + b'\x93' : ('ldquo', '201C'), + b'\x94' : ('rdquo', '201D'), + b'\x95' : ('bull', '2022'), + b'\x96' : ('ndash', '2013'), + b'\x97' : ('mdash', '2014'), + b'\x98' : ('tilde', '2DC'), + b'\x99' : ('trade', '2122'), + b'\x9a' : ('scaron', '161'), + b'\x9b' : ('rsaquo', '203A'), + b'\x9c' : ('oelig', '153'), + b'\x9d' : '?', + b'\x9e' : ('#x17E', '17E'), + b'\x9f' : ('Yuml', ''),} ####################################################################### @@ -1997,4 +1997,4 @@ if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) - print soup.prettify() + print(soup.prettify()) --- BeautifulSoupTests.py +++ BeautifulSoupTests.py @@ -82,7 +82,7 @@ def testFindAllText(self): soup = BeautifulSoup("\xbb") self.assertEqual(soup.findAll(text=re.compile('.*')), - [u'\xbb']) + ['\xbb']) def testFindAllByRE(self): import re @@ -215,7 +215,7 @@ soup = BeautifulSoup(self.x, parseOnlyThese=strainer) self.assertEquals(len(soup), 10) - strainer = SoupStrainer(text=lambda(x):x[8]=='3') + strainer = SoupStrainer(text=lambda x:x[8]=='3') soup = BeautifulSoup(self.x, parseOnlyThese=strainer) self.assertEquals(len(soup), 3) @@ -256,7 +256,7 @@ self.assertEqual(copied.decode(), self.soup.decode()) def testUnicodePickle(self): - import cPickle as pickle + import pickle as pickle html = "" + chr(0xc3) + "" soup = BeautifulSoup(html) dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) @@ -586,23 +586,23 @@ self.assertEquals(soup.decode(), "<>") soup = BeautifulStoneSoup(text, convertEntities=htmlEnt) - self.assertEquals(soup.decode(), u"<>") + self.assertEquals(soup.decode(), "<>") # Make sure the "XML", "HTML", and "XHTML" settings work. text = "<™'" soup = BeautifulStoneSoup(text, convertEntities=xmlEnt) - self.assertEquals(soup.decode(), u"<™'") + self.assertEquals(soup.decode(), "<™'") soup = BeautifulStoneSoup(text, convertEntities=htmlEnt) - self.assertEquals(soup.decode(), u"<\u2122'") + self.assertEquals(soup.decode(), "<\u2122'") soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt) - self.assertEquals(soup.decode(), u"<\u2122'") + self.assertEquals(soup.decode(), "<\u2122'") def testNonBreakingSpaces(self): soup = BeautifulSoup("  ", convertEntities=BeautifulStoneSoup.HTML_ENTITIES) - self.assertEquals(soup.decode(), u"\xa0\xa0") + self.assertEquals(soup.decode(), "\xa0\xa0") def testWhitespaceInDeclaration(self): self.assertSoupEquals('', '') @@ -617,27 +617,27 @@ self.assertSoupEquals('hello there') def testEntitiesInAttributeValues(self): - self.assertSoupEquals('', '', + self.assertSoupEquals('', b'', encoding='utf-8') - self.assertSoupEquals('', '', + self.assertSoupEquals('', b'', encoding='utf-8') soup = BeautifulSoup('', convertEntities=BeautifulStoneSoup.HTML_ENTITIES) - self.assertEquals(soup.decode(), u'') + self.assertEquals(soup.decode(), '') uri = "http://crummy.com?sacré&bleu" link = '' % uri soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES) self.assertEquals(soup.decode(), - link.replace("é", u"\xe9")) + link.replace("é", "\xe9")) uri = "http://crummy.com?sacré&bleu" link = '' % uri soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES) self.assertEquals(soup.a['href'], - uri.replace("é", u"\xe9")) + uri.replace("é", "\xe9")) def testNakedAmpersands(self): html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES} @@ -663,13 +663,13 @@ smart quote fixes.""" def testUnicodeDammitStandalone(self): - markup = "\x92" + markup = b"\x92" dammit = UnicodeDammit(markup) - self.assertEquals(dammit.unicode, "") + self.assertEquals(dammit.str, "") - hebrew = "\xed\xe5\xec\xf9" + hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) - self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9') + self.assertEquals(dammit.str, '\u05dd\u05d5\u05dc\u05e9') self.assertEquals(dammit.originalEncoding, 'iso-8859-8') def testGarbageInGarbageOut(self): @@ -677,13 +677,13 @@ asciiSoup = BeautifulStoneSoup(ascii) self.assertEquals(ascii, asciiSoup.decode()) - unicodeData = u"\u00FC" + unicodeData = "\u00FC" utf8 = unicodeData.encode("utf-8") - self.assertEquals(utf8, '\xc3\xbc') + self.assertEquals(utf8, b'\xc3\xbc') unicodeSoup = BeautifulStoneSoup(unicodeData) self.assertEquals(unicodeData, unicodeSoup.decode()) - self.assertEquals(unicodeSoup.foo.string, u'\u00FC') + self.assertEquals(unicodeSoup.foo.string, '\u00FC') utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8') self.assertEquals(utf8, utf8Soup.encode('utf-8')) @@ -696,18 +696,18 @@ def testHandleInvalidCodec(self): for bad_encoding in ['.utf8', '...', 'utF---16.!']: - soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"), + soup = BeautifulSoup("Räksmörgås".encode("utf-8"), fromEncoding=bad_encoding) self.assertEquals(soup.originalEncoding, 'utf-8') def testUnicodeSearch(self): - html = u'

Räksmörgås

' + html = '

Räksmörgås

' soup = BeautifulSoup(html) - self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås') + self.assertEqual(soup.find(text='Räksmörgås'),'Räksmörgås') def testRewrittenXMLHeader(self): - euc_jp = '\n\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n\n' - utf8 = "\n\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n\n" + euc_jp = b'\n\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n\n' + utf8 = b"\n\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n\n" soup = BeautifulStoneSoup(euc_jp) if soup.originalEncoding != "euc-jp": raise Exception("Test failed when parsing euc-jp document. " @@ -718,12 +718,12 @@ self.assertEquals(soup.originalEncoding, "euc-jp") self.assertEquals(soup.renderContents('utf-8'), utf8) - old_text = "\x92" + old_text = b"\x92" new_text = "" self.assertSoupEquals(old_text, new_text) def testRewrittenMetaTag(self): - no_shift_jis_html = '''\n
\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n
''' + no_shift_jis_html = b'''\n
\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n
''' soup = BeautifulSoup(no_shift_jis_html) # Beautiful Soup used to try to rewrite the meta tag even if the @@ -733,16 +733,16 @@ soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer) self.assertEquals(soup.contents[0].name, 'pre') - meta_tag = ('') + meta_tag = (b'') shift_jis_html = ( - '\n%s\n' - '' - '
\n'
-            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
-            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
-            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
-            '
') % meta_tag + b'\n' + meta_tag + b'\n' + b'' + b'
\n'
+            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
+            b'
') soup = BeautifulSoup(shift_jis_html) if soup.originalEncoding != "shift-jis": raise Exception("Test failed when parsing shift-jis document " @@ -755,59 +755,59 @@ content_type_tag = soup.meta['content'] self.assertEquals(content_type_tag[content_type_tag.find('charset='):], 'charset=%SOUP-ENCODING%') - content_type = str(soup.meta) + content_type = soup.meta.decode() index = content_type.find('charset=') self.assertEqual(content_type[index:index+len('charset=utf8')+1], 'charset=utf-8') content_type = soup.meta.encode('shift-jis') - index = content_type.find('charset=') + index = content_type.find(b'charset=') self.assertEqual(content_type[index:index+len('charset=shift-jis')], 'charset=shift-jis'.encode()) self.assertEquals(soup.encode('utf-8'), ( - '\n' - '\n' - '' - '
\n'
-                '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
-                '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
-                '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
-                '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
-                '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
-                '
')) + b'\n' + b'\n' + b'' + b'
\n'
+                b'\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
+                b'\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
+                b'\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
+                b'\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
+                b'\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
+                b'
')) self.assertEquals(soup.encode("shift-jis"), shift_jis_html.replace('x-sjis'.encode(), 'shift-jis'.encode())) - isolatin = """Sacr\xe9 bleu!""" + isolatin = b"""Sacr\xe9 bleu!""" soup = BeautifulSoup(isolatin) utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) - utf8 = utf8.replace("\xe9", "\xc3\xa9") + utf8 = utf8.replace(b"\xe9", b"\xc3\xa9") self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8') def testHebrew(self): - iso_8859_8= '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xed\xe5\xec\xf9\n\n' - utf8 = '\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n\n' + iso_8859_8= b'\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xed\xe5\xec\xf9\n\n' + utf8 = b'\nHebrew (ISO 8859-8) in Visual Directionality\n\n\n

Hebrew (ISO 8859-8) in Visual Directionality

\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n\n' soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8") self.assertEquals(soup.encode('utf-8'), utf8) def testSmartQuotesNotSoSmartAnymore(self): - self.assertSoupEquals("\x91Foo\x92 ", + self.assertSoupEquals(b"\x91Foo\x92 ", '‘Foo’ ') def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self): - smartQuotes = "Il a dit, \x8BSacré bleu!\x9b" + smartQuotes = b"Il a dit, \x8BSacré bleu!\x9b" soup = BeautifulSoup(smartQuotes) self.assertEquals(soup.decode(), 'Il a dit, ‹Sacré bleu!›') soup = BeautifulSoup(smartQuotes, convertEntities="html") self.assertEquals(soup.encode('utf-8'), - 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') + b'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba') def testDontSeeSmartQuotesWhereThereAreNone(self): - utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch" + utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" self.assertSoupEquals(utf_8, encoding='utf-8') --- setup.py +++ setup.py @@ -19,19 +19,19 @@ suite = loader.loadTestsFromModule(BeautifulSoupTests) suite.run(result) if not result.wasSuccessful(): - print "Unit tests have failed!" + print("Unit tests have failed!") for l in result.errors, result.failures: for case, error in l: - print "-" * 80 + print("-" * 80) desc = case.shortDescription() if desc: - print desc - print error - print '''If you see an error like: "'ascii' codec can't encode character...", see\nthe Beautiful Soup documentation:\n http://www.crummy.com/software/BeautifulSoup/documentation.html#Why%20can't%20Beautiful%20Soup%20print%20out%20the%20non-ASCII%20characters%20I%20gave%20it?''' - print "This might or might not be a problem depending on what you plan to do with\nBeautiful Soup." + print(desc) + print(error) + print('''If you see an error like: "'ascii' codec can't encode character...", see\nthe Beautiful Soup documentation:\n http://www.crummy.com/software/BeautifulSoup/documentation.html#Why%20can't%20Beautiful%20Soup%20print%20out%20the%20non-ASCII%20characters%20I%20gave%20it?''') + print("This might or might not be a problem depending on what you plan to do with\nBeautiful Soup.") if sys.argv[1] == 'sdist': - print - print "I'm not going to make a source distribution since the tests don't pass." + print() + print("I'm not going to make a source distribution since the tests don't pass.") sys.exit(1) setup(name="BeautifulSoup",