BeautifulSoup-KeepStudy

BeautifulSoup

## 安装 BeautifulSoup
    pip install beautifulsoup4
    pip install lxml  ## 推荐使用 lxml 作为解析器（速度更快）
    pip install html5lib ##  html5lib的解析方式与浏览器相同

#主要的解析器
    Python标准库：BeautifulSoup(markup, "html.parser") Python的内置标准库、执行速度适中、文档容错能力强
    lxml HTML解析器：	BeautifulSoup(markup, "lxml")	速度快、文档容错能力强、需要安装C语言库
    lxml XML 解析器：BeautifulSoup(markup, "xml") 速度快、唯一支持XML的解析器、需要安装C语言库
    html5lib：BeautifulSoup(markup, "html5lib") 最好的容错性、以浏览器的方式解析文档、生成HTML5格式的文档、速度慢

## 属性与方法
 BeautifulSoup()	 一个 BeautifulSoup 对象。	 soup = BeautifulSoup(html_doc, 'html.parser')
 .prettify()	 格式化并美化文档内容，生成结构化的字符串。	 print(soup.prettify())
 .find()	 查找第一个匹配的标签。	 tag = soup.find('a')
 .find_all()	 查找所有匹配的标签，返回一个列表。	 tags = soup.find_all('a')
 .find_all_next()	 查找当前标签后所有符合条件的标签。	 tags = soup.find('div').find_all_next('p')
 .find_all_previous()	 查找当前标签前所有符合条件的标签。	 tags = soup.find('div').find_all_previous('p')
 .find_parent()	 返回当前标签的父标签。	 parent = tag.find_parent()
 .find_parents()	 查找当前标签的所有父辈标签（祖先节点）。	 parents = tag.find_parents()
 .find_next_sibling()	 查找当前标签的下一个兄弟标签。	 next_sibling = tag.find_next_sibling()
 .find_previous_sibling() 查找当前标签的前一个兄弟标签。	 prev_sibling = tag.find_previous_sibling()
 .get_text()	 提取标签内的文本内容，忽略所有HTML标签。	 text = tag.get_text()
 .decompose()	 从树中删除当前标签及其内容。	 tag.decompose()
 .unwrap()	 移除标签本身，只保留其子内容。	 tag.unwrap()
 .insert()	 向标签内插入新标签或文本。	 tag.insert(0, new_tag)
 .insert_before()	 在当前标签前插入新标签。	 tag.insert_before(new_tag)
 .insert_after()	 在当前标签后插入新标签。	 tag.insert_after(new_tag)
 .extract()	 删除标签并返回该标签。 extracted_tag = tag.extract()
 .replace_with()	 替换当前标签及其内容。	 tag.replace_with(new_tag)
 .has_attr()	 检查标签是否有指定的属性。	 if tag.has_attr('href'):
 .get()	 获取指定属性的值。	 href = tag.get('href')
 .clear()	 清空标签的所有内容。	 tag.clear()
 .encode()	 编码标签内容为字节流。	 encoded = tag.encode()
 .is_ancestor_of()	 检查当前标签是否是指定标签的祖先元素。	 if tag.is_ancestor_of(another_tag):
 .is_descendant_of()	 检查当前标签是否是指定标签的后代元素。	 if tag.is_descendant_of(another_tag):
 str(soup) 压缩输出（不做格式化），返回字符串；需要指定编码的字节码时可调用 encode() 方法.
 unicode(soup.a) 压缩输出（仅适用于 Python 2，Python 3 中请使用 str(soup.a)）；还可以调用 encode() 方法获得字节码或调用 decode() 方法获得Unicode
 
 
 .parent	 获取当前标签的父标签。	 parent = tag.parent
 .next_sibling	 获取当前标签的下一个兄弟标签。	 next_sibling = tag.next_sibling
 .previous_sibling	 获取当前标签的前一个兄弟标签。	 prev_sibling = tag.previous_sibling
 .attrs	 返回标签的所有属性，以字典形式表示。	 href = tag.attrs['href']
 .string	 获取标签内的字符串内容。	 string_content = tag.string
 .name	 返回标签的名称。	 tag_name = tag.name
 .contents	 返回标签的所有子元素，以列表形式返回。	 children = tag.contents
 .descendants	 返回标签的所有后代元素，生成器形式。	 for child in tag.descendants: print(child)
 .previous_element	 获取解析顺序中的前一个节点（可能是标签，也可能是文本节点）。	 prev_elem = tag.previous_element
 .next_element	 获取解析顺序中的下一个节点（可能是标签，也可能是文本节点）。	 next_elem = tag.next_element
 .is_empty_element	 检查标签是否是空元素（例如 <br>、<img> 等）。	if tag.is_empty_element:
 
## 其他属性
 tag['style']	 获取标签的内联样式（style 属性）。	 style = tag['style']
 tag['id']	 获取标签的 id 属性。	 id = tag['id']
 tag['class']	 获取标签的 class 属性（class 是多值属性，返回列表）。	class_name = tag['class']
 .string	 获取标签内部的字符串内容。	content = tag.string

## 其他
    find_all(string)  使用字符串查找匹配的标签。	tag = soup.find_all('div', class_='container')
    find_all(id)	  查找指定 id 的标签。	    tag = soup.find_all(id='main')
    find_all(attrs)	  查找具有指定属性的标签。	tag = soup.find_all(attrs={"href": "http://example.com"})

## 对象的种类
 Tag： Tag 对象与XML或HTML原生文档中的tag相同
 soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
 tag = soup.b

Name：每个tag都有自己的名字,通过 .name 来获取
        tag.name
        tag.name = "blockquote"

Attributes：一个tag可能有很多个属性，tag的属性的操作方法与字典相同
        tag['class']
        tag.attrs      #”点”取属性
        tag['class'] = 'verybold' #增加
        del tag['class'] #删除

## tag的名字
 soup.head
 soup.title
 soup.body.b #获取<body>标签中的第一个<b>标签
 soup.a #通过点取属性的方式只能获得当前名字的第一个tag
 soup.find_all('a') #得到所有的<a>标签,
 
 head_tag = soup.head
 head_tag.contents
 title_tag = head_tag.contents[0]
 len(soup.contents)
 len(list(soup.children))
 len(list(soup.descendants))

for child in title_tag.children:
        print(child)

## .descendants 属性可以对所有tag的子孙节点进行递归循环
    for child in head_tag.descendants:
        print(child)

## 正则表达式
 如果传入正则表达式作为参数,Beautiful Soup会通过正则表达式的 search() 来匹配内容.下面例子中找出所有以b开头的标签,这表示<body>和<b>标签都应该被找到: 
 import re
 for tag in soup.find_all(re.compile("^b")):
     print(tag.name)

## 列表
 如果传入列表参数,Beautiful Soup会将与列表中任一元素匹配的内容返回.下面代码找到文档中所有<a>标签和<b>标签:
 soup.find_all(["a", "b"])

## True
    True 可以匹配任何值,下面代码查找到所有的tag,但是不会返回字符串节点
    for tag in soup.find_all(True):
        print(tag.name)

## 方法
    如果没有合适过滤器,那么还可以定义一个方法,方法只接受一个元素参数,如果这个方法返回 True 表示当前元素匹配并且被找到,如果不是则返回 False
    
    下面方法校验了当前元素,如果包含 class 属性却不包含 id 属性,那么将返回 True:
        def has_class_but_no_id(tag):
            return tag.has_attr('class') and not tag.has_attr('id')

将这个方法作为参数传入 find_all() 方法,将得到所有<p>标签:
 soup.find_all(has_class_but_no_id)

通过一个方法来过滤一类标签属性的时候, 这个方法的参数是要被过滤的属性的值, 而不是这个标签. 下面的例子是找出 href 属性不符合指定正则的 a 标签.
        def not_lacie(href):
                return href and not re.compile("lacie").search(href)

标签过滤方法可以使用复杂方法. 下面的例子可以过滤出前后都有文字的标签.
        from bs4 import NavigableString
        def surrounded_by_strings(tag):
            return (isinstance(tag.next_element, NavigableString) and isinstance(tag.previous_element, NavigableString))

## find( name , attrs , recursive , string , **kwargs )
find_all( name , attrs , recursive , string , limit , **kwargs )
find_parents( name , attrs , string , limit , **kwargs )
find_parent( name , attrs , string , **kwargs )
find_next_siblings( name , attrs , string , limit , **kwargs )
find_next_sibling( name , attrs , string , **kwargs )
find_previous_siblings( name , attrs , string , limit , **kwargs )
find_previous_sibling( name , attrs , string , **kwargs )
find_all_next( name , attrs , string , limit , **kwargs )
find_next( name , attrs , string , **kwargs )
find_all_previous( name , attrs , string , limit , **kwargs )
find_previous( name , attrs , string , **kwargs )

soup.find_all("title")
    soup.find_all("p", "title")
    soup.find_all(id="link2")
    soup.find(string=re.compile("sisters"))
    soup.find_all(id=True) #在文档树中查找所有包含 id 属性的tag,无论 id 的值是什么
    soup.find_all(href=re.compile("elsie"), id='link1') #使用多个指定名字的参数可以同时过滤tag的多个属性
    data_soup.find_all(attrs={"data-foo": "value"}) #HTML5中的 data-* 属性不能作为关键字参数搜索（data-foo 不是合法的Python参数名），但可以通过attrs参数定义一个字典参数来搜索包含特殊属性的tag

按CSS类名搜索（class 是Python保留字，因此使用 class_ 参数）
        soup.find_all("a", class_="sister")
        soup.find_all(class_=re.compile("itl"))
        css_soup.find_all("p", class_="strikeout")
        css_soup.find_all("p", class_="body strikeout")

string 参数：搜文档中的字符串内容.
        soup.find_all(string="Elsie")
        soup.find_all(string=["Tillie", "Elsie", "Lacie"])
        soup.find_all(string=re.compile("Dormouse"))
        soup.find_all("a", string="Elsie")
        
    limit参数：限制返回结果的数量
        soup.find_all("a", limit=2)
      
    recursive 参数：只想搜索tag的直接子节点
        soup.html.find_all("title", recursive=False)

## CSS选择器
    soup.select("title")
    soup.select("body a")
    soup.select("head > title")
    soup.select("p > a")
    soup.select("p > a:nth-of-type(2)")
    soup.select("p > #link1")
    soup.select("#link1 ~ .sister") #找到 #link1 之后所有符合条件的兄弟节点标签
    soup.select("#link1 + .sister") #只找到紧跟在 #link1 之后的那一个兄弟节点标签
    soup.select(".sister")
    soup.select("[class~=sister]")
    soup.select("#link1")
    soup.select("a#link2")
    soup.select("#link1,#link2") #同时用多种CSS选择器查询元素
    soup.select('a[href]') #通过是否存在某个属性来查找
    soup.select('a[href="http://example.com/elsie"]') #通过属性的值来查找
    multilingual_soup.select('p[lang|=en]') #通过语言设置来查找
    soup.select_one(".sister") #返回查找到的元素的第一个

## 修改文档树    
    tag['class'] = 'verybold'     #修改tag的属性（修改名称请使用 tag.name = "blockquote"）
    tag.string = "New link text." #修改.string
    soup.a.append("Bar")          #向tag中添加内容
    tag.insert(1, "but did not endorse ")
    soup.b.string.insert_before(tag)
    soup.b.i.insert_after(soup.new_string(" ever "))
    tag.clear()              #移除当前tag的内容
    i_tag = soup.i.extract() #方法将当前tag移除文档树,并作为方法结果返回
    soup.i.decompose()       #将当前节点移除文档树并完全销毁
    a_tag.i.replace_with(new_tag) #移除文档树中的某段内容,并用新tag或文本节点替代它
    soup.p.wrap(soup.new_tag("div")) #对指定的tag元素进行包装 ,并返回包装后的结果:
    a_tag.i.unwrap() #与 wrap() 方法相反.移除tag标签本身,只保留其中的内容,该方法常被用来进行标记的解包

顶部

Python爬虫

目录

相关推荐