Requests库简介
Requests是Python中最流行的HTTP客户端库,具有以下特点:
- 人性化的API设计
- 支持HTTP连接保持
- 自动内容解码
- 支持文件上传下载
- 社区活跃,文档完善
安装它也很简单:
pip install requests
基础请求方法
GET请求基础
import requests

# Send a plain GET request and inspect the outcome.
resp = requests.get('https://baidu.com')
print(resp.status_code)  # 200 on success
print(resp.text)  # the decoded response body
带参数的GET请求
# 查询参数设置
# Query-string parameters; a list value repeats the key in the URL.
query = {
    'key1': 'value1',
    'key2': ['value2', 'value3'],
}
response = requests.get('https://httpbin.org/get', params=query)
print(response.url)  # the URL that was actually requested
# https://httpbin.org/get?key1=value1&key2=value2&key2=value3
请求头设置与反爬绕过
自定义请求头
# Browser-like request headers to avoid trivial bot detection.
custom_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.google.com/',
    'Cookie': 'session_id=abc123;',
}
resp = requests.get(
    'https://httpbin.org/headers',
    headers=custom_headers,
)
print(resp.json())  # the headers as received by the server
随机User-Agent实战
from fake_useragent import UserAgent

# Pick a fresh random User-Agent string for this request.
ua_pool = UserAgent()
resp = requests.get(
    'https://httpbin.org/user-agent',
    headers={'User-Agent': ua_pool.random},
)
print(resp.json())  # shows which User-Agent was sent
记得安装fake_useragent:
pip install fake_useragent
POST请求与表单提交
表单数据提交
# Submit form-encoded fields via POST (application/x-www-form-urlencoded).
credentials = {
    'username': 'admin',
    'password': 'secret',
}
resp = requests.post('https://httpbin.org/post', data=credentials)
print(resp.json()['form'])  # echo of the submitted form fields
JSON数据提交
# Send a JSON body; requests serializes it and sets Content-Type for us.
person = {
    "name": "John",
    "age": 30,
    "city": "New York",
}
resp = requests.post('https://httpbin.org/post', json=person)
print(resp.json()['json'])  # echo of the JSON payload
文件上传实战
# Upload a file. The original left the handle open forever; `with`
# guarantees it is closed even if the request raises.
with open('test.jpg', 'rb') as fh:
    response = requests.post(
        'https://httpbin.org/post',
        files={'file': fh},
    )
print(response.json()['files'])  # metadata of the uploaded file
会话保持与Cookie管理
使用Session保持会话
# A Session reuses one TCP connection and carries cookies between calls.
with requests.Session() as session:
    # Log in once; any Set-Cookie from the server is stored on the session.
    session.post('https://example.com/login', data={'user': 'test'})
    # Later requests on the same session send those cookies automatically.
    dashboard = session.get('https://example.com/dashboard')
    print(dashboard.text)
手动处理Cookie
# Capture the cookies from one response, then replay them by hand.
first = requests.get('https://www.example.com')
saved_cookies = first.cookies
response = requests.get(
    'https://www.example.com/profile',
    cookies=saved_cookies,  # attach the previously received cookies
)
异常处理与超时设置
基础异常处理
# Catch the most specific classes first: Timeout and HTTPError both
# derive from RequestException, which is the catch-all base.
try:
    resp = requests.get('https://www.example.com', timeout=5)
    resp.raise_for_status()  # raise HTTPError for 4xx/5xx responses
except requests.exceptions.Timeout:
    print("请求超时")
except requests.exceptions.HTTPError as err:
    print(f"HTTP错误: {err}")
except requests.exceptions.RequestException as err:
    print(f"请求异常: {err}")
重试机制实现
from requests.adapters import HTTPAdapter
# NOTE: import Retry from urllib3 directly -- the
# requests.packages.urllib3 re-export path is deprecated.
from urllib3.util.retry import Retry

# Retry transient server errors with exponential backoff.
session = requests.Session()
retries = Retry(
    total=3,                                # at most 3 retries per request
    backoff_factor=0.3,                     # waits 0.3s, 0.6s, 1.2s between attempts
    status_forcelist=[500, 502, 503, 504],  # retry only these status codes
)
# Plain ASCII quotes here -- the original used curly quotes, a SyntaxError.
session.mount('https://', HTTPAdapter(max_retries=retries))
response = session.get('https://unstable-site.com/api')
SSL验证与代理设置
禁用SSL验证
# For test environments only: disabling certificate verification exposes
# the connection to man-in-the-middle attacks (urllib3 will emit a warning).
response = requests.get(
    'https://self-signed-cert-site.com',
    verify=False,  # skip TLS certificate validation
)
使用代理
# Route traffic through forward proxies, one entry per URL scheme.
proxy_map = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}
resp = requests.get(
    'https://httpbin.org/ip',
    proxies=proxy_map,
)
print(resp.json())  # origin IP reported by the server -- should be the proxy's
实例代码
根据以上对requests的了解,我们可以构建适合自己的爬取函数,放在指定的py文件中,需要使用时引入即可,无需重复编写requests代码,例如:
'''生成user-agent保存到user_agent.txt'''
def sc_user_agent():
    """Append 1000 random User-Agent strings to user_agent.txt, one per line.

    The original reopened the output file once per line and first built a
    throwaway 1000-element list; here the file is opened once and each
    User-Agent is streamed straight to it.
    """
    ua = UserAgent()
    with open('user_agent.txt', 'a+', encoding='utf-8') as f:
        for _ in range(1000):
            f.write(ua.random)
            f.write('\n')
'''随机代理'''
def get_random_ip():
    """Return one proxy IP chosen uniformly at random from ip.txt.

    Uses splitlines() so the final entry is kept even when the file has no
    trailing newline (the original's split('\\n')[0:-1] silently dropped it),
    and skips blank lines.

    Returns:
        str: one non-empty line from ip.txt.

    Raises:
        FileNotFoundError: if ip.txt does not exist.
        IndexError: if ip.txt contains no non-blank lines.
    """
    with open('ip.txt', 'r', encoding='utf-8') as f:
        candidates = [line.strip() for line in f.read().splitlines() if line.strip()]
    return random.choice(candidates)
def get_random_useragent():
    """Return one User-Agent string chosen at random from user_agent.txt.

    Mirrors get_random_ip: splitlines() keeps the last entry even without a
    trailing newline (the original's split('\\n')[0:-1] dropped it), and
    blank lines are ignored.

    Returns:
        str: one non-empty line from user_agent.txt.

    Raises:
        FileNotFoundError: if user_agent.txt does not exist.
        IndexError: if user_agent.txt contains no non-blank lines.
    """
    with open('user_agent.txt', 'r', encoding='utf-8') as f:
        candidates = [line.strip() for line in f.read().splitlines() if line.strip()]
    return random.choice(candidates)
'''本地调用'''
def get_requests(url, headers, cookies, proxies, timeout):
    """GET *url* with optional randomized User-Agent/proxy, retrying on failure.

    Args:
        url: target URL.
        headers: pass 1 to use a random User-Agent from user_agent.txt;
            any other value sends no custom headers.
        cookies: reserved flag; cookies are currently never sent
            (both branches of the original resolved to None).
        proxies: pass 1 to use a random proxy IP from ip.txt;
            any other value sends no proxy.
        timeout: per-request timeout in seconds.

    Returns:
        requests.Response on HTTP 200, or None once max_retries attempts fail.
    """
    max_retries = 5  # maximum number of attempts
    # Record the caller's flags up front: the original overwrote the
    # parameters themselves on the first pass, so every retry silently
    # lost the random-header/proxy configuration.
    use_random_headers = headers == 1
    use_random_proxy = proxies == 1

    for attempt in range(1, max_retries + 1):
        try:
            # Fresh random choices on every attempt.
            req_headers = {"User-Agent": get_random_useragent()} if use_random_headers else None
            req_cookies = None  # cookie support not implemented yet
            req_proxies = {"http": f"http://{get_random_ip()}"} if use_random_proxy else None
            response = requests.get(
                url,
                headers=req_headers,
                cookies=req_cookies,
                proxies=req_proxies,
                timeout=timeout,
                stream=True,
            )
            if response.status_code == 200:
                return response
            print(f"访问失败,状态码:{response.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"请求过程中出现错误:{e}")
        # Wait 5 seconds before the next attempt (except after the last one).
        if attempt < max_retries:
            print(f"请求失败,第 {attempt} 次重试...")
            time.sleep(5)
    # All attempts exhausted without a 200 response.
    print(f"请求失败,已超过最大重试次数 {max_retries} 次")
    return None