BeautifulSoup使用

import re

from bs4 import BeautifulSoup

# Sample document for the node-selector examples below.
# NOTE: the markup never closes <body>/<html>; the parser is expected to
# repair that while building the tree.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Parse with the lxml backend (requires the third-party `lxml` package).
soup = BeautifulSoup(html, 'lxml')
# print(soup.prettify())  # pretty-printed (re-indented) markup
print(soup.p.attrs)  # {'class': ['title'], 'name': 'dromouse'}

# ---------------------------------------------------------------------------
# Node selectors
# ---------------------------------------------------------------------------
# Selecting an element: attribute access returns the FIRST matching tag.
print(type(soup.title))  # <class 'bs4.element.Tag'>
print(soup.title.string)  # The Dormouse's story
print(soup.head)  # <head><title>The Dormouse's story</title></head>
print(soup.p)  # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

# Extracting information
# Tag name
print(soup.title.name)  # title
print(soup.p.name)  # p
# Attributes: `tag[...]` is shorthand for `tag.attrs[...]`.
print(soup.p.attrs)  # {'class': ['title'], 'name': 'dromouse'}
print(soup.p.attrs['name'])  # dromouse
print(soup.p['name'])  # dromouse
print(soup.p['class'])  # ['title']  (class is multi-valued, hence a list)
# Text content
print(soup.p.string)  # The Dormouse's story

# Nested selection: every result is itself a Tag, so lookups can be chained.
print(soup.head.title.string)  # The Dormouse's story

# Second sample document: same as the first but without the
# <p class="title"> paragraph, so `soup.p` is now the first "story" paragraph.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# Navigating the tree
# (1) Children and descendants
# `.contents` is a list of the direct children.
print('(1)子节点和子孙节点')
print(soup.p.contents)
# `.children` also yields the direct children, but as an iterator.
print(soup.p.children)  # e.g. <list_iterator object at 0x7fdf9fa12820>
for i, child in enumerate(soup.p.children):
    print(i, child)
# Expected output -- direct children only; the <span> stays nested in its <a>:
# 0 Once upon a time there were three little sisters; and their names were
#
# 1 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
# 2
#
# 3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# 4  and
#
# 5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# 6
# and they lived at the bottom of a well.

# `.descendants` walks every node below the tag, depth-first.
print(soup.p.descendants)  # e.g. <generator object Tag.descendants at 0x7fdf9fa9bb30>
for i, child in enumerate(soup.p.descendants):
    print(i, child)

# Expected output -- the <span>Elsie</span> inside the first <a> and every
# text node are now listed on their own:
# 0 Once upon a time there were three little sisters; and their names were
#
# 1 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
# 2 <span>Elsie</span>
# 3 Elsie
# 4
#
# 5 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# 6 Lacie
# 7  and
#
# 8 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# 9 Tillie
# 10
# and they lived at the bottom of a well.
# Parent and ancestors
print(soup.a.parent)  # the direct parent (the enclosing <p class="story">)
print(soup.a.parents)  # generator over all ancestors

# Siblings
print(soup.a.next_sibling)  # the NEXT sibling node
print(soup.a.previous_sibling)  # the PREVIOUS sibling node
print(soup.a.next_siblings)  # generator over the following siblings
print(soup.a.previous_siblings)  # generator over the preceding siblings

# Extracting information
print(soup.a.string)  # text content: Elsie
# Iterating a Tag yields its children; index 1 is the first <a>.
print(list(soup.a.parent)[1].attrs['class'])  # attribute value: ['sister']

# Only meaningful for a document that contains <p class="title">:
# print(soup.find_all('p', attrs={'class': 'title'})[0].get_text())

# ---------------------------------------------------------------------------
# Method selectors
# ---------------------------------------------------------------------------
# find_all(name=None, attrs={}, recursive=True, string=None, limit=None, **kwargs)
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(name='ul')[0])
print(type(soup.find_all(name='ul')[0]))  # <class 'bs4.element.Tag'>
# Results are Tags, so each <ul> can be searched again for its own <li> nodes.
for ul in soup.find_all(name='ul'):
    print(ul.find_all(name='li'))
# [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
# [<li class="element">Foo</li>, <li class="element">Bar</li>]

# Filter by attributes with `attrs`.
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))
# Common attributes have keyword shortcuts; `class_` avoids the reserved word.
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='element'))

# Match on node text. `string` is the current name of the argument that
# older Beautiful Soup releases called `text`.
print(soup.find_all(string=re.compile('Foo')))

# Other lookup methods:
# soup.find()
# soup.find_parent()
# soup.find_parents()
# soup.find_next_sibling()
# soup.find_next_siblings()
# soup.find_previous_sibling()
# soup.find_all_next()
# soup.find_all_previous()
# ....

# ---------------------------------------------------------------------------
# CSS selectors
# ---------------------------------------------------------------------------
# select() returns a list of every match; select_one() returns the first.
print(soup.select('.panel .panel-heading'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))
print(soup.select_one('ul')['id'])  # list-1
print(soup.select_one('ul').attrs['id'])  # list-1
print(soup.select_one('li').get_text())  # Foo
print(soup.select_one('li').string)  # Foo
本文链接：https://www.cnblogs.com/fly-book/p/15092858.html
BeautifulSoup使用

BeautifulSoup使用的更多相关文章

随机推荐

热门专题

目录导航