Python爬虫入门

参考

莫烦Python爬虫 https://mofanpy.com/tutorials/data-manipulation/scraping/why
Beautiful Soup https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/

了解网页结构

可以参考我blog前端的部分
- HTML https://bsheepcoder.github.io/2022/02/18/Fe_HTML1/
- CSS https://bsheepcoder.github.io/2022/02/18/Fe_CSS1/
大多数爬的是body内的信息

了解传输协议

数据交互的约定

http

常用请求头
- User-Agent：请求载体的身份标识
- Connection：请求完毕后，是断开连接还是保持连接
常用响应头信息
- Content-Type

https

安全的超文本传输协议
加密方式
- 对称密钥加密
- 非对称密钥加密
- 证书密钥加密

正则表达式选取

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

from urllib.request import urlopen
import re

# Fetch the page and decode it as UTF-8 so the Chinese text is read correctly.
# The with-block closes the HTTP response once the body has been read.
with urlopen("https://bsheepcoder.github.io/2022/02/18/Fe_CSS1/") as response:
    html = response.read().decode('utf-8')
print(html)

# Capture the text between <title> and </title>; the lazy quantifier stops at
# the first closing tag.
res = re.findall(r"<title>(.+?)</title>", html)

print(res)

Beautiful soup

简化爬取的过程
高级匹配

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Fetch the page and decode it as UTF-8 so the Chinese text is read correctly.
# The with-block closes the HTTP response once the body has been read.
with urlopen("https://bsheepcoder.github.io/") as response:
    html = response.read().decode('utf-8')

# Parse the document with the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Walk every <a> element and print the links whose href starts with "http".
# Anchors without an href are skipped explicitly instead of being coerced to
# the string "None" first.
for item in soup.select('a'):
    detail_url = item.get('href')
    if detail_url and detail_url.startswith("http"):
        print(detail_url)

# 输出
https://github.com/Bsheepcoder
https://github.com/Bsheepcoder
https://hexo.io
https://github.com/jerryc127/hexo-theme-butterfly

CSS爬取

通过class针对css标签选取，获取内容

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Fetch the page and decode it as UTF-8 so the Chinese text is read correctly.
# The with-block closes the HTTP response once the body has been read.
with urlopen("https://bsheepcoder.github.io/") as response:
    html = response.read().decode('utf-8')

# Parse the document with the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Select the <a> elements whose class is "article-title" and print their text.
article_links = soup.find_all('a', {'class': 'article-title'})
for article_link in article_links:
    print(article_link.get_text())

soup+正则

爬取特定对象的所有链接

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Fetch the page and decode it as UTF-8 so the Chinese text is read correctly.
# The with-block closes the HTTP response once the body has been read.
with urlopen("https://www.csdn.net/?spm=1005.2025.3001.4476") as response:
    html = response.read().decode('utf-8')

# Parse the document with the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Select the <img> elements whose src matches the pattern, then print each src.
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot; in a normal string literal "\." is an invalid escape sequence
# that Python warns about.
jpg_images = soup.find_all('img', {'src': re.compile(r'.*?\.jpg')})
for image in jpg_images:
    print(image['src'])

爬取特定地址的网页

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Fetch the page and decode it as UTF-8 so the Chinese text is read correctly.
# The with-block closes the HTTP response once the body has been read.
with urlopen("https://www4.bing.com/search?q=find_all").read and urlopen("https://www4.bing.com/search?q=find_all") as response:
    html = response.read().decode('utf-8')

# Parse the document with the lxml parser.
soup = BeautifulSoup(html, "lxml")

# Select the <a> elements whose href points at www.cnblogs.com and print each
# href. The dots are escaped so they match a literal "." only. The trailing
# "/*" of the earlier pattern is dropped: in a regular expression it means
# "zero or more slashes" (not a glob wildcard), so it never restricted matches.
cnblogs_links = soup.find_all('a', {'href': re.compile(r'https://www\.cnblogs\.com')})
for cnblogs_link in cnblogs_links:
    print(cnblogs_link['href'])

多功能Requests

Post
get

import requests

# Pass the query string as a dict through the params argument of requests.get.
query = {"wd": '莫烦python'}
response = requests.get('http://www.baidu.com/s', params=query)

# Print the URL of the response.
print(response.url)

import requests

# Submit form fields with a POST request and print the response body.
data = {'username': '', 'password': ''}
r = requests.post(
    'https://webvpn.scuec.edu.cn/users/sign_in', data=data
)
print(r.text)

# Upload a file with a POST request. The file is opened in a with-block so the
# handle is closed once the request has completed instead of staying open.
with open('./image.png', 'rb') as image_file:
    file = {'uploadFile': image_file}
    r = requests.post(
        'http://pythonscraping.com/pages/files/processing2.php', files=file
    )
print(r.text)

下载图片练习

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import requests
import os

# Fetch the listing page and collect the <img> elements whose class is
# "post-thumb". NOTE(review): "URL" looks like a placeholder for the real page
# address -- replace it before running.
html = requests.get("URL").text
soup = BeautifulSoup(html, 'lxml')
img_ul = soup.find_all('img', {"class": "post-thumb"})

print(len(img_ul))

# Create the output directory.
os.makedirs('./img/', exist_ok=True)

# Download every image into ./img/, named after the last path segment of its
# URL.
for ul in img_ul:
    url = ul['src']
    image_name = url.split('/')[-1]
    # stream=True lets the body be written chunk by chunk; the with-block
    # closes the response when the download is finished or skipped.
    with requests.get(url, stream=True) as r:
        if not r.ok:
            # Do not store an HTTP error page under an image file name.
            print('Skip %s (HTTP %s)' % (url, r.status_code))
            continue
        with open('./img/%s' % image_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print('Save %s' % image_name)

广泛爬图

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import re
import requests
import os

# UA detection
# UA spoofing: add a User-Agent to the request headers

# Parameter setup.
# NOTE(review): '主URL' and '子URL' look like placeholders for the real listing
# addresses -- replace them before running.
url1 = '主URL'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 '
                  'Safari/537.36 '
}
# Highest listing page number to crawl; page N is fetched from '子URL' + str(N).
last_page = 69

# Create the output directory once, before the crawl starts.
os.makedirs('./img/', exist_ok=True)

# Send the first request.
# NOTE(review): verify=False turns off TLS certificate verification for this
# request and for the image downloads below -- confirm that it is required.
response_html = requests.get(url=url1, headers=headers, verify=False).text

next_page = 2
while True:
    # Collect the gallery links on the current listing page.
    soup = BeautifulSoup(response_html, 'lxml')
    img_url = soup.find_all('a', {"class": "featured-img-box"})
    print("该页有链接{}个图集".format(len(img_url)))
    for iu in img_url:
        url_img = iu['href']
        response_imgPage = requests.get(url=url_img, headers=headers).text
        imgPage_soup = BeautifulSoup(response_imgPage, 'lxml')
        img_link = imgPage_soup.find_all('img', {"title": "source: imgur.com"})
        for il in img_link:
            link = il['src']
            print('图片链接:', link)
            # NOTE(review): the first link that starts with 'https://i' ends
            # the processing of this gallery (break, not continue) -- confirm
            # that skipping the remaining images is intended.
            if link.startswith('https://i'):
                break
            image_name = link.split('/')[-1]
            # Stream the image to disk; the with-block closes the response
            # when the download is finished or skipped.
            with requests.get(link, stream=True, verify=False) as r:
                if not r.ok:
                    # Do not store an HTTP error page under an image file name.
                    print('Skip %s (HTTP %s)' % (link, r.status_code))
                    continue
                with open('./img/%s' % image_name, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            print('Save %s' % image_name)

    # Every listing page is processed before the next one is requested, so the
    # last page is crawled too instead of being fetched and then discarded.
    if next_page > last_page:
        break
    ul2 = '子URL' + str(next_page)
    response_html = requests.get(url=ul2, headers=headers).text
    next_page += 1

多进程分布式

利用多核cpu，cmd 输入 devmgmt.msc 查看自己的cpu是几核的
Python多线程有全局解释器锁（GIL），CPU密集型任务推荐多进程