# BeautifulSoup使用
import re
from bs4 import BeautifulSoup
# Sample document used by the node-selector examples in this section.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# print(soup.prettify())  # pretty-print the parsed tree
print(soup.p.attrs)

# ---------------------------------------------------------------------------
# Node selectors
# ---------------------------------------------------------------------------

# Selecting elements: attribute access on the soup yields the first tag with
# that name.
print(type(soup.title))  # <class 'bs4.element.Tag'>
print(soup.title.string)  # The Dormouse's story
print(soup.head)  # <head><title>The Dormouse's story</title></head>
print(soup.p)  # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

# Extracting information
# -- tag name
print(soup.title.name)  # title
print(soup.p.name)  # p
# -- attributes (`class` is multi-valued, so it comes back as a list)
print(soup.p.attrs)  # {'class': ['title'], 'name': 'dromouse'}
print(soup.p.attrs['name'])  # dromouse
print(soup.p['name'])  # dromouse
print(soup.p['class'])  # ['title']
# -- text content
print(soup.p.string)  # The Dormouse's story

# Nested selection: each result is itself a Tag, so lookups can be chained.
print(soup.head.title.string)  # The Dormouse's story
# Second sample document: same as the first but without the <p class="title">
# paragraph, so `soup.p` now refers to the first "story" paragraph.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')

# ---------------------------------------------------------------------------
# Relational selection
# ---------------------------------------------------------------------------

# (1) Children and descendants
# `.contents` is the list of direct children.
print('(1)子节点和子孙节点')
print(soup.p.contents)

# `.children` also yields the direct children, but lazily: printing it shows
# an iterator object rather than the nodes themselves.
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i, child)
# Expected output -- only direct children are listed, so the <span> stays
# nested inside its <a>; whitespace-only text nodes show up as blank entries:
#   0 Once upon a time there were three little sisters; and their names were
#   1 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
#   2
#   3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
#   4  and
#   5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
#   6
#   and they lived at the bottom of a well.

# `.descendants` walks every nested node; printing it shows a generator object.
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):
    print(i, child)
# Expected output -- the <span>Elsie</span> inside the first <a>, and every
# text node, are now emitted as separate entries:
#   0 Once upon a time there were three little sisters; and their names were
#   1 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
#   2 <span>Elsie</span>
#   3 Elsie
#   4
#   5 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
#   6 Lacie
#   7  and
#   8 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
#   9 Tillie
#   10
#   and they lived at the bottom of a well.

# (2) Parent and ancestors
print(soup.a.parent)  # the direct parent node
print(soup.a.parents)  # generator over all ancestors (prints the generator object)

# (3) Siblings
print(soup.a.next_sibling)  # the NEXT sibling node
print(soup.a.previous_sibling)  # the PREVIOUS sibling node
print(soup.a.next_siblings)  # generator over all following siblings
print(soup.a.previous_siblings)  # generator over all preceding siblings

# Extracting information from the selected nodes
print(soup.a.string)  # text of the first <a>
# Iterating a Tag yields its children; index 1 is the first <a> (see the
# children listing above), whose class attribute is then read.
print(list(soup.a.parent)[1].attrs['class'])
# NOTE: the line below targets <p class="title">, which this second document
# does not contain; it only works against the first sample document.
# print(soup.find_all('p', attrs={'class': 'title'})[0].get_text())
# ---------------------------------------------------------------------------
# Method selectors
# ---------------------------------------------------------------------------
# find_all(self, name=None, attrs={}, recursive=True, string=None, limit=None, **kwargs)
# (`string` was called `text` before Beautiful Soup 4.4.0.)
html = """
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
"""
soup = BeautifulSoup(html, 'lxml')

# Search by tag name; each hit is a Tag.
print(soup.find_all(name='ul')[0])
print(type(soup.find_all(name='ul')[0]))  # <class 'bs4.element.Tag'>

# Because results are Tags, a search can be repeated on each hit: find every
# <ul>, then the <li> nodes inside it.
for ul in soup.find_all(name='ul'):
    print(ul.find_all(name='li'))
# [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
# [<li class="element">Foo</li>, <li class="element">Bar</li>]

# Search by attributes via the `attrs` dict.
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))

# Common attributes have keyword shortcuts; `class` is a Python keyword, hence
# the trailing underscore in `class_`.
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='element'))

# Match on node text. `string=` is the current name of this argument; the
# older `text=` spelling is a legacy alias.
print(soup.find_all(string=re.compile('Foo')))

# Related search methods:
# soup.find()
# soup.find_parent()
# soup.find_parents()
# soup.find_next_sibling()
# soup.find_next_siblings()
# soup.find_previous_sibling()
# soup.find_all_next()
# soup.find_all_previous()
# ....
# ---------------------------------------------------------------------------
# CSS selectors
# ---------------------------------------------------------------------------
# These calls run against the `soup` parsed from the panel/list document in
# the method-selector section.

# select() returns a list of every node matching the CSS selector.
print(soup.select('.panel .panel-heading'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))

# select_one() returns only the first match, which can then be queried like
# any other Tag.
print(soup.select_one('ul')['id'])  # list-1
print(soup.select_one('ul').attrs['id'])  # list-1
print(soup.select_one('li').get_text())  # Foo
print(soup.select_one('li').string)  # Foo
# 版权声明:本文为fly-book原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。