Understanding web page structure

Understanding transfer protocols

A protocol is an agreed convention for data exchange.

HTTP

  • Common request headers (a short sketch follows this list)
    • User-Agent: identifies the client making the request
    • Connection: whether to close the connection or keep it alive once the request completes
  • Common response headers
    • Content-Type: the media type of the response body
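As a quick illustration (a minimal sketch using the requests library; the URL is a placeholder), the request headers above can be set explicitly and the Content-Type response header read back:

import requests

# Set the common request headers explicitly
headers = {
    'User-Agent': 'Mozilla/5.0',  # identity of the requesting client
    'Connection': 'keep-alive',   # keep the TCP connection open after the request
}
r = requests.get('https://example.com', headers=headers)

# Read a common response header
print(r.headers['Content-Type'])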

HTTPS

  • The secure Hypertext Transfer Protocol
  • Encryption schemes (a note on certificate verification follows this list)
    • Symmetric-key encryption
    • Asymmetric-key encryption
    • Certificate-based key encryption
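On the client side, the certificate scheme is what scraping code actually touches: requests verifies the server certificate by default. A minimal sketch (the URL is a placeholder; verify=False appears again in a later example and should be used with care):

import requests

# Certificate verification is on by default
r = requests.get('https://example.com')
# verify=False skips certificate verification (not recommended)
r = requests.get('https://example.com', verify=False)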

Selecting with regular expressions

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

from urllib.request import urlopen
import re

# Fetch the page and decode the HTML as UTF-8 (the page contains Chinese text)
html = urlopen("https://bsheepcoder.github.io/2022/02/18/Fe_CSS1/").read().decode('utf-8')
print(html)

# Non-greedy regular expression: capture the text inside <title>...</title>
res = re.findall(r"<title>(.+?)</title>", html)

print(res)
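The same idea extends to other tags; for example (a hypothetical sketch reusing the html and the re import from the snippet above), every href value can be pulled out with one non-greedy pattern:

# Non-greedy match for every href attribute value in the page
links = re.findall(r'href="(.+?)"', html)
print(links)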

Beautiful Soup

  • Simplifies the scraping workflow
  • Higher-level matching than raw regular expressions
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Fetch the page and decode the HTML as UTF-8
html = urlopen("https://bsheepcoder.github.io/").read().decode('utf-8')

# Parse with the lxml parser
soup = BeautifulSoup(html, "lxml")

# Find every <a> tag and pull out its link, keeping only absolute URLs
for item in soup.select('a'):
    detail_url = str(item.get('href'))
    if detail_url[0:4] == "http":
        print(detail_url)

# Output
https://github.com/Bsheepcoder
https://github.com/Bsheepcoder
https://hexo.io
https://github.com/jerryc127/hexo-theme-butterfly
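The prefix check can also be pushed into the selector itself; as an alternative sketch (reusing the soup object from the snippet above), a CSS attribute selector does the filtering:

# Select only <a> tags whose href starts with "http"
for item in soup.select('a[href^="http"]'):
    print(item.get('href'))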

Scraping by CSS class

  • Select elements by their CSS class attribute to extract their content
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Fetch the page and decode the HTML as UTF-8
html = urlopen("https://bsheepcoder.github.io/").read().decode('utf-8')

# Parse with the lxml parser
soup = BeautifulSoup(html, "lxml")

# Match <a> tags carrying the article-title class
titles = soup.find_all('a', {'class': 'article-title'})
for i in titles:
    print(i.get_text())
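Equivalently (a sketch against the same soup object), a CSS class selector matches the same elements:

# a.article-title selects <a> tags with class="article-title"
for tag in soup.select('a.article-title'):
    print(tag.get_text())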

soup + regex

  • Scrape all links of a particular kind of element
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Fetch the page and decode the HTML as UTF-8
html = urlopen("https://www.csdn.net/?spm=1005.2025.3001.4476").read().decode('utf-8')

# Parse with the lxml parser
soup = BeautifulSoup(html, "lxml")

# Match <img> tags whose src points at a .jpg file
images = soup.find_all('img', {'src': re.compile(r'.*?\.jpg')})
for i in images:
    print(i['src'])
  • Scrape all links pointing at a specific site
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Fetch the page and decode the HTML as UTF-8
html = urlopen("https://www4.bing.com/search?q=find_all").read().decode('utf-8')

# Parse with the lxml parser
soup = BeautifulSoup(html, "lxml")

# Match <a> tags whose href starts with the cnblogs.com address
links = soup.find_all('a', {'href': re.compile(r'^https://www\.cnblogs\.com/')})
for i in links:
    print(i['href'])
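Search result pages often repeat the same target link; a small hedged refinement that deduplicates the links list built above:

# Collect the unique matching hrefs
unique_links = sorted({a['href'] for a in links})
for link in unique_links:
    print(link)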

The versatile Requests library

  • GET
  • POST
import requests

# GET with query parameters: requests builds the query string for you
param = {"wd": '莫烦python'}
r = requests.get('http://www.baidu.com/s', params=param)
print(r.url)

# POST form data (the credentials are left blank here)
data = {'username': '', 'password': ''}
r = requests.post(
    'https://webvpn.scuec.edu.cn/users/sign_in', data=data
)
print(r.text)

# POST a file upload
file = {'uploadFile': open('./image.png', 'rb')}
r = requests.post(
    'http://pythonscraping.com/pages/files/processing2.php', files=file
)
print(r.text)
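For a login flow like the sign_in POST above, the cookies from the login response usually have to carry over to later requests. A hedged sketch with requests.Session (the follow-up URL on the same host is assumed for illustration):

import requests

# A Session persists cookies across requests, so the login survives
with requests.Session() as s:
    s.post('https://webvpn.scuec.edu.cn/users/sign_in',
           data={'username': '', 'password': ''})
    # Assumed follow-up page on the same host; sent with the session cookies
    r = s.get('https://webvpn.scuec.edu.cn/')
    print(r.status_code)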

Exercise: downloading images

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
import os

# "URL" is a placeholder for the page being scraped
html = requests.get("URL").text
soup = BeautifulSoup(html, 'lxml')
img_ul = soup.find_all('img', {"class": "post-thumb"})

print(len(img_ul))

# Create the output folder
os.makedirs('./img/', exist_ok=True)

# Download each image, streaming it to disk in small chunks
for ul in img_ul:
    url = ul['src']
    r = requests.get(url, stream=True)
    image_name = url.split('/')[-1]
    with open('./img/%s' % image_name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
    print('Save %s' % image_name)
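One hardening step worth adding: check the HTTP status before writing, so an error page is not saved to disk as an image. A minimal sketch (fetch_image is a hypothetical helper, not part of the original script):

import requests

def fetch_image(url):
    # Stream the response and fail loudly on HTTP errors
    r = requests.get(url, stream=True)
    r.raise_for_status()  # raises on 4xx/5xx instead of saving an error page
    return r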

Crawling images at scale

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
import os

# The site checks the User-Agent (UA detection),
# so spoof a browser UA in the request headers.

# Parameters ('主URL' and '子URL' below are placeholders for the
# main page URL and the paginated sub-page URL prefix)
url1 = '主URL'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

# Create the output folder once
os.makedirs('./img/', exist_ok=True)

# Fetch the first page
response_html = requests.get(url=url1, headers=headers, verify=False).text

for i in range(2, 70):
    # Collect the gallery links on the current page
    soup = BeautifulSoup(response_html, 'lxml')
    img_url = soup.find_all('a', {"class": "featured-img-box"})
    print("This page links to {} galleries".format(len(img_url)))
    for iu in img_url:
        # Open each gallery page and collect its imgur-hosted images
        url_img = iu['href']
        response_imgPage = requests.get(url=url_img, headers=headers).text
        imgPage_soup = BeautifulSoup(response_imgPage, 'lxml')
        img_link = imgPage_soup.find_all('img', {"title": "source: imgur.com"})
        for il in img_link:
            link = il['src']
            print('Image link:', link)
            # Only fetch direct image links (https://i.imgur.com/...)
            if not link.startswith('https://i'):
                continue
            r = requests.get(link, stream=True, verify=False)
            image_name = link.split('/')[-1]
            with open('./img/%s' % image_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=128):
                    f.write(chunk)
            print('Save %s' % image_name)
    # Move on to the next paginated page
    ul2 = '子URL' + str(i)
    response_html = requests.get(url=ul2, headers=headers).text

Multiprocessing for distribution

  • Use all the CPU cores; on Windows, run devmgmt.msc from cmd to check how many cores your CPU has
  • Python threads share a global interpreter lock (the GIL), so multiprocessing is recommended (see the sketch below)
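A minimal multiprocessing sketch (the URL list and the fetch helper are hypothetical, chosen to mirror the earlier examples): a Pool spreads page fetches across worker processes, sidestepping the GIL.

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from multiprocessing import Pool
import requests

# Hypothetical list of pages to fetch in parallel
urls = [
    'https://bsheepcoder.github.io/',
    'https://www.baidu.com/',
]

def fetch(url):
    # Each worker process fetches one page and reports its size
    r = requests.get(url)
    return url, len(r.text)

if __name__ == '__main__':
    # The __main__ guard is required on Windows for multiprocessing
    with Pool(processes=min(4, len(urls))) as pool:
        for url, size in pool.map(fetch, urls):
            print('%s -> %d bytes' % (url, size))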