PyQuery的基本使用

PyQuery库是一个强大而灵活的网页解析库,它的语法与jQuery类似,如果你学习过jQuery,那么PyQuery就是你的绝佳选择

初始化解析内容

1.字符串初始化

html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('ul'))

输出结果:

<ul>
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>

2.URL初始化

from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
print(doc('head'))

3.文件初始化

from pyquery import PyQuery as pq
doc = pq(filename='index.html')
print(doc('head'))

基本CSS选择器

使用jQuery风格的CSS选择器选取元素

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list li')) 

查找元素

1.获取子元素

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list') #获取到的是class="list"的ul标签元素
print(items)

lis = items.find('li') #获取到的是items内的li标签
print(lis)

2.获取父元素

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
container = items.parent() #获取.list的直接父元素
print(container)


parent = items.parents('.wrap') #parents()获取.list的所有祖先元素,传入选择器则只保留匹配.wrap的祖先元素
print(parent)

3.获取兄弟元素

from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list')
print(li.siblings()) #获取.list的兄弟元素

print(li.siblings('.active'))#获取.list的指定兄弟元素

获取信息

1.获取属性

from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)

#两种方式获取a标签的href属性
print(a.attr('href'))
print(a.attr.href)

2.获取文本

from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)

print(a.text())

3.获取HTML

from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)

print(a.html())

DOM操作

1.addClass、removeClass

from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active') 
print(li)
li.addClass('active')
print(li)

2.attr、css

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link') #添加name属性
print(li)
li.css('font-size', '14px') #添加样式
print(li)

输出结果:

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>

3.remove

html = '''
<div class="wrap">
    Hello, World
    <p>This is a paragraph.</p>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove()
print(wrap.text())

输出结果:

Hello, World This is a paragraph.
Hello, World

  转载请注明: Justin博客 PyQuery的基本使用

 上一篇
自动化Selenium的基本使用(一) 自动化Selenium的基本使用(一)
selenium 是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。支持的浏览器包括IE(7, 8, 9, 10, 11),Mozilla Firefox,Safari,Google Ch
2019-04-06
下一篇 
Linux防火墙操作 Linux防火墙操作
预备环境系统:centos6或者7 More info: Centos Centos防火墙操作###Centos6防火墙操作 $ sudo service firewalld stop #停止服务 $ sudo service fire
2019-04-02
  目录