# Python中使用lxml解析HTML文件并提取指定节点的示例代码
from urllib.parse import urljoin

import requests
from lxml import etree
# Fetch the HTML page. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() stops us from parsing an HTTP
# error page (4xx/5xx) as if it were the real content.
url = "http://example.com"
response = requests.get(url, timeout=10)
response.raise_for_status()
# Parse the markup into an lxml element tree; `html` is the root element that
# every XPath query in this script runs against.
html = etree.HTML(response.text)
# Select every <div class="content"> element.
nodes = html.xpath('//div[@class="content"]')
# Print the text of each match. Element.text only holds the text that precedes
# the first child element, so it is None (or bare whitespace) when the content
# sits in nested tags such as <p>. The XPath string() function returns the
# concatenated text of the whole subtree instead.
for node in nodes:
    print(node.xpath("string()").strip())
# Example output:
#   Lorem ipsum dolor sit amet, consectetur adipiscing elit.
#   Suspendisse hendrerit libero ac dui suscipit, quis molestie quam elementum.
# Read an attribute value from each <a class="link"> element.
links = html.xpath('//a[@class="link"]')
for link in links:
    # Element.get() returns None for a missing attribute, whereas
    # attrib['href'] raises KeyError on an anchor that has no href.
    href = link.get("href")
    if href is not None:
        print(href)
# Example output:
#   http://example.com/link1
#   http://example.com/link2
# Print an absolute URL for every <img> src found on the page.
imgs = html.xpath('//img/@src')
for img in imgs:
    # urljoin resolves the src against the page URL. Plain concatenation
    # breaks as soon as the two parts do not line up exactly:
    # "http://example.com" + "image1.jpg" gives "http://example.comimage1.jpg",
    # and an already-absolute src would get the page URL glued in front of it.
    print(urljoin(url, img))
# Example output:
#   http://example.com/image1.jpg
#   http://example.com/image2.jpg
# Find links whose own text contains "example".
# contains(text(), ...) would convert the text() node-set to a string using
# only its first node, so a link like <a>foo<br/>example</a> would be missed.
# Applying the predicate to each text node checks all of them.
links_with_text = html.xpath('//a[text()[contains(., "example")]]')
for link in links_with_text:
    # Skip anchors without an href instead of failing with KeyError.
    href = link.get("href")
    if href is not None:
        print(href)
# Example output:
#   http://example.com/link1
#   http://example.com/link2
# Step up from each <div class="content"> to its parent with the ".." shortcut
# and print the parent's tag name.
parents = html.xpath('//div[@class="content"]/..')
for tag_name in (element.tag for element in parents):
    print(tag_name)
# Example output:
#   body
# The child:: axis selects the elements sitting directly under each
# <div class="content">; print the tag name of every one of them.
children = html.xpath('//div[@class="content"]/child::*')
for tag_name in (element.tag for element in children):
    print(tag_name)
# Example output:
#   p
#   p
# The "*" wildcard after "//" matches every element at any depth below each
# <div class="content">; print each descendant's tag name.
all_nodes = html.xpath('//div[@class="content"]//*')
for descendant in all_nodes:
    print(descendant.tag)
# Example output:
#   p
#   p
# Chain two attribute predicates: only divs that have both class="content" and
# data-type="example" qualify, and their <p> children are selected.
filtered_nodes = html.xpath('//div[@class="content"][@data-type="example"]/p')
for paragraph in filtered_nodes:
    print(paragraph.text)
# Example output:
#   Lorem ipsum dolor sit amet, consectetur adipiscing elit.
#   Suspendisse hendrerit libero ac dui suscipit, quis molestie quam elementum.
# The ancestor:: axis selects every element that encloses a
# <div class="content">. lxml returns the matches in document order, so the
# outermost ancestor is printed first.
ancestor_nodes = html.xpath('//div[@class="content"]/ancestor::*')
for ancestor in ancestor_nodes:
    print(ancestor.tag)
# Example output:
#   html
#   body
# Use an XPath function to measure the text length of a node.
# string-length() applied to a node-set measures the string value of the first
# matching <div class="content"> (0 when nothing matches). lxml hands XPath
# numbers back as float, so convert to int to print "105" rather than "105.0".
lengths = int(html.xpath('string-length(//div[@class="content"])'))
print(lengths)
# Example output:
#   105
# Combine both attribute tests inside a single predicate with "and", then
# select the <p> children of the divs that satisfy both conditions.
filtered_nodes = html.xpath('//div[@class="content" and @data-type="example"]/p')
for paragraph in filtered_nodes:
    print(paragraph.text)
# Example output:
#   Lorem ipsum dolor sit amet, consectetur adipiscing elit.
#   Suspendisse hendrerit libero ac dui suscipit, quis molestie quam elementum.
