跳转到内容

Python 编程/XML 工具

来自维基教科书,开放世界中的开放书籍


Python 包含几个用于操作 XML 的模块。

xml.sax.handler

[编辑 | 编辑源代码]

Python 文档

import xml.sax.handler as saxhandler
import xml.sax as saxparser

class MyReport:
    """Placeholder report object; it only carries a single attribute ``Y``."""

    def __init__(self):
        """Create a report whose ``Y`` attribute starts at 1."""
        self.Y: int = 1


class MyCH(saxhandler.ContentHandler):
    """SAX content handler that prints document-start and element-start events.

    Attributes:
        X: Integer set to 1 on construction; not read by this class.
        report: The object passed to ``__init__``; stored but not otherwise
            used by this class.
    """

    def __init__(self, report):
        """Initialise the handler and remember *report*.

        Args:
            report: Arbitrary object kept on ``self.report``.
        """
        # Run the base initialiser so the handler starts from the state
        # ContentHandler itself sets up, instead of a partially built object.
        # The explicit form (rather than super()) also works on Python 2.
        saxhandler.ContentHandler.__init__(self)
        self.X = 1
        self.report = report

    def startDocument(self):
        """Print a marker when the parser reports the start of the document."""
        print('startDocument')

    def startElement(self, name, attrs):
        """Print the name of each element as it is opened.

        Args:
            name: Element name reported by the parser.
            attrs: Attributes of the element; unused here.
        """
        print('Element:', name)

# Sample document that is echoed and then fed to the SAX parser.
xml = """\
<collection>
  <comic title=\"Sandman\" number='62'>
     <writer>Neil Gaiman</writer>
     <penciller pages='1-9,18-24'>Glyn Dillon</penciller>
     <penciller pages="10-17">Charles Vess</penciller>
  </comic>
</collection>
"""

print(xml)

# The report is only stored on the handler for now (reserved for future use).
report = MyReport()
ch = MyCH(report)

# Parse the in-memory document; the handler prints each event it receives.
saxparser.parseString(xml, ch)

xml.dom.minidom

[编辑 | 编辑源代码]

使用 DOM 解析 RSS 订阅源(feed)的示例

from xml.dom import minidom as dom
import urllib2

def fetchPage(url):
    """Open *url* and return the complete response body.

    Args:
        url: Address to open.

    Returns:
        The raw body exactly as read from the response (``bytes`` on
        Python 3, ``str`` on Python 2).
    """
    # ``urllib2`` exists only on Python 2; resolve ``urlopen`` locally so the
    # function works on either major version.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib2 import urlopen

    response = urlopen(url)
    try:
        # A single read() replaces ''.join(readlines()), which raises
        # TypeError on Python 3 because the lines are bytes.
        return response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

def _first_child_text(node, tag):
    """Return the leading text of the first ``<tag>`` under *node*, or ''."""
    matches = node.getElementsByTagName(tag)
    if not matches:
        return ''
    # An empty element has firstChild None, and a non-text first child has no
    # ``wholeText``; both cases yield an empty string instead of an error.
    return getattr(matches[0].firstChild, 'wholeText', '')


def extract(page):
    """Print title, link and description of every ``<item>`` in *page*.

    Each item that has child nodes produces one ``print`` call with the three
    fields. A field whose element is missing or empty is printed as an empty
    string rather than aborting the whole run.

    Args:
        page: XML document in a form accepted by ``minidom.parseString``.
    """
    document = dom.parseString(page)
    for item in document.getElementsByTagName('item'):
        if not item.hasChildNodes():
            continue
        title = _first_child_text(item, 'title')
        link = _first_child_text(item, 'link')
        description = _first_child_text(item, 'description')
        print(title, link, description)

if __name__ == '__main__':
    # Download the Slashdot feed, then print the fields of each item.
    feed_url = "http://rss.slashdot.org/Slashdot/slashdot"
    page = fetchPage(feed_url)
    extract(page)

示例中的 XML 文档取自 PyXML 文档。

华夏公益教科书