XML , , . XML . Etree .
>>> import xml.etree.ElementTree as etree
>>> tree = etree.parse('examples/feed.xml')
>>> root = tree.getroot()
>>> root.findall('{http://www.w3.org/2005/Atom}entry') ①
[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, <Element {http://www.w3.org/2005/Atom}entry at e2b510>, <Element {http://www.w3.org/2005/Atom}entry at e2b540>]
>>> root.tag
'{http://www.w3.org/2005/Atom}feed'
>>> root.findall('{http://www.w3.org/2005/Atom}feed') ②
[]
>>> root.findall('{http://www.w3.org/2005/Atom}author') ③
[]
① findall() . ( .)
② ( ) findall(). . ? , . feed feed, .
③ . XML author; , ( entry). author (direct children) ; ( ). author , .
>>> tree.findall('{http://www.w3.org/2005/Atom}entry') ①
[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, <Element {http://www.w3.org/2005/Atom}entry at e2b510>, <Element {http://www.w3.org/2005/Atom}entry at e2b540>]
>>> tree.findall('{http://www.w3.org/2005/Atom}author') ②
[]
① tree ( etree.parse()) . tree.getroot().findall().
② , , author . ? , tree.getroot().findall('{http://www.w3.org/2005/Atom}author'), author, . author ; entry. , .
findall() find() . .
|
|
① findall() atom:entry.
② find() ElementTree .
③ foo , find() None.
find(). ElementTree False (. if len(element) == 0). if element.find('...') , find() ; ! find() if element.find('...') is not None. |
, . , .
>>> all_links = tree.findall('//{http://www.w3.org/2005/Atom}link') ①
>>> all_links
[<Element {http://www.w3.org/2005/Atom}link at e181b0>, <Element {http://www.w3.org/2005/Atom}link at e2b570>, <Element {http://www.w3.org/2005/Atom}link at e2b480>, <Element {http://www.w3.org/2005/Atom}link at e2b5a0>]
>>> all_links[0].attrib ②
{'href': 'http://diveintomark.org/', 'type': 'text/html', 'rel': 'alternate'}
>>> all_links[1].attrib ③
{'href': 'http://diveintomark.org/archives/2009/03/27/dive-into-history-2009-edition', 'type': 'text/html', 'rel': 'alternate'}
>>> all_links[2].attrib
{'href': 'http://diveintomark.org/archives/2009/03/21/accessibility-is-a-harsh-mistress', 'type': 'text/html', 'rel': 'alternate'}
>>> all_links[3].attrib
{'href': 'http://diveintomark.org/archives/2008/12/18/give-part-1-container-formats', 'type': 'text/html', 'rel': 'alternate'}
① //{http://www.w3.org/2005/Atom}link . // . // , . , .
② . , , html .
③ entry. entry link. findall() , link.
, findall() ElementTree , . ElementTree XPath. XPath W3C XML . ElementTree XPath . , XPath. XML API ElementTree XPath.
|
|
LXML
lxml libxml2. API ElementTree, XPath 1.0 . Windows ; Linux (, yum apt-get). lxml .
>>> from lxml import etree ①
>>> tree = etree.parse('examples/feed.xml') ②
>>> root = tree.getroot() ③
>>> root.findall('{http://www.w3.org/2005/Atom}entry') ④
[<Element {http://www.w3.org/2005/Atom}entry at e2b4e0>, <Element {http://www.w3.org/2005/Atom}entry at e2b510>, <Element {http://www.w3.org/2005/Atom}entry at e2b540>]
① lxml API ElementTree.
② parse(): ElementTree.
③ getroot(): .
④ findall(): .
XML lxml ElementTree. API ElementTree , lxml , , ElementTree.
try:
    from lxml import etree
except ImportError:
    import xml.etree.ElementTree as etree
, lxml ElementTree: findall() .
>>> import lxml.etree ①
>>> tree = lxml.etree.parse('examples/feed.xml')
>>> tree.findall('//{http://www.w3.org/2005/Atom}*[@href]') ②
[<Element {http://www.w3.org/2005/Atom}link at eeb8a0>, <Element {http://www.w3.org/2005/Atom}link at eeb990>, <Element {http://www.w3.org/2005/Atom}link at eeb960>, <Element {http://www.w3.org/2005/Atom}link at eeb9c0>]
>>> tree.findall("//{http://www.w3.org/2005/Atom}*[@href='http://diveintomark.org/']") ③
[<Element {http://www.w3.org/2005/Atom}link at eeb930>]
>>> NS = '{http://www.w3.org/2005/Atom}'
>>> tree.findall('//{NS}author[{NS}uri]'.format(NS=NS)) ④
[<Element {http://www.w3.org/2005/Atom}author at eeba80>, <Element {http://www.w3.org/2005/Atom}author at eebba0>]
① lxml.etree ( etree: from lxml import etree) , lxml.
② Atom ( ), href. // , . {http://www.w3.org/2005/Atom} Atom. * . [@href] href.
③ Atom href http://diveintomark.org/.
④ ( ) Atom author Atom uri. 2 author: entry. entry author name, uri.
|
|
? lxml XPath 1.0. XPath, . XPath lxml.
>>> import lxml.etree
>>> tree = lxml.etree.parse('examples/feed.xml')
>>> NSMAP = {'atom': 'http://www.w3.org/2005/Atom'} ①
>>> entries = tree.xpath("//atom:category[@term='accessibility']/..", ②
...     namespaces=NSMAP)
>>> entries ③
[<Element {http://www.w3.org/2005/Atom}entry at e2b630>]
>>> entry = entries[0]
>>> entry.xpath('./atom:title/text()', namespaces=NSMAP) ④
['Accessibility is a harsh mistress']
① XPath , . Python.
② XPath . category ( Atom) - term='accessibility'. , . /.. ? , . , entry <category term='accessibility'>.
③ xpath() ElementTree. entry term='accessibility'.
④ XPath . , DOM XML , (nodes). , . XPath . : text() title (atom:title) (./).
XML
ElementTree XML , .
>>> import xml.etree.ElementTree as etree
>>> new_feed = etree.Element('{http://www.w3.org/2005/Atom}feed', ①
...     attrib={'{http://www.w3.org/XML/1998/namespace}lang': 'en'}) ②
>>> print(etree.tostring(new_feed)) ③
<ns0:feed xmlns:ns0='http://www.w3.org/2005/Atom' xml:lang='en'/>
① Element. ( ). feed Atom. XML.
② attrib. , ElementTree {namespace}localname.
③ tostring() ElementTree.
new_feed? ElementTree XML , . XML xmlns='http://www.w3.org/2005/Atom'. (, Atom), , , (<feed>, <link>, <entry>). , .
|
|
XML XML . DOM
<ns0:feed xmlns:ns0='http://www.w3.org/2005/Atom' xml:lang='en'/>
<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'/>
, . ns0: , 4 × 79 + 4 , 320 . UTF-8 320 . ( gzip 21 ; 21 21 ). , , Atom, , .
lxml: ElementTree lxml .
>>> import lxml.etree
>>> NSMAP = {None: 'http://www.w3.org/2005/Atom'} ①
>>> new_feed = lxml.etree.Element('feed', nsmap=NSMAP) ②
>>> print(lxml.etree.tounicode(new_feed)) ③
<feed xmlns='http://www.w3.org/2005/Atom'/>
>>> new_feed.set('{http://www.w3.org/XML/1998/namespace}lang', 'en') ④
>>> print(lxml.etree.tounicode(new_feed))
<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'/>
① . ; - . None .
② lxml nsmap, .
③ , Atom feed .
④ , xml:lang. set(), . : ElementTree . ( ElementTree. lxml ElementTree nsmap .)
? , . .
>>> title = lxml.etree.SubElement(new_feed, 'title', ①
...     attrib={'type':'html'}) ②
>>> print(lxml.etree.tounicode(new_feed)) ③
<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'><title type='html'/></feed>
>>> title.text = 'dive into &hellip;' ④
>>> print(lxml.etree.tounicode(new_feed)) ⑤
<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'><title type='html'>dive into &amp;hellip;</title></feed>
>>> print(lxml.etree.tounicode(new_feed, pretty_print=True)) ⑥
<feed xmlns='http://www.w3.org/2005/Atom' xml:lang='en'>
  <title type='html'>dive into &amp;hellip;</title>
</feed>
① SubElement. ( new_feed) . , .
② . , - .
③ , title Atom feed. title , lxml />.
④ , .text.
⑤ title . < ', escape-. lxml .
⑥ (pretty printing), . lxml (insignificant whitespace) XML .
, , xmlwitch, Python with XML . |