笔记 · 2025-02-23

Python爬虫核心请求库:Requests实战指南

Requests库简介

Requests是Python中最流行的HTTP客户端库,具有以下特点:

  • 人性化的API设计
  • 支持HTTP连接保持
  • 自动内容解码
  • 支持文件上传下载
  • 社区活跃,文档完善

安装它也很简单:

pip install requests

基础请求方法

GET请求基础

import requests

# Issue a plain GET request and inspect the result
response = requests.get('https://baidu.com')
print(response.status_code)  # 200 on success
print(response.text)  # the response body as text

带参数的GET请求

# Query parameters: a list value repeats the key in the query string
params = {
    'key1': 'value1',
    'key2': ['value2', 'value3'],
}

response = requests.get('https://httpbin.org/get', params=params)

print(response.url)  # the URL actually requested
# https://httpbin.org/get?key1=value1&key2=value2&key2=value3

请求头设置与反爬绕过

自定义请求头

# Custom request headers: spoof UA, referer and cookie to look like a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.google.com/',
    'Cookie': 'session_id=abc123;'
}

response = requests.get('https://httpbin.org/headers', headers=headers)

print(response.json())  # the headers as received by the server

随机User-Agent实战

from fake_useragent import UserAgent

# Pick a fresh random browser User-Agent for each request
ua_factory = UserAgent()

response = requests.get(
    'https://httpbin.org/user-agent',
    headers={'User-Agent': ua_factory.random}
)

print(response.json())  # shows the User-Agent actually sent

记得安装fake_useragent:

pip install fake_useragent

POST请求与表单提交

表单数据提交

# Submit form-encoded credentials
# (Content-Type: application/x-www-form-urlencoded)
login_data = {'username': 'admin', 'password': 'secret'}

response = requests.post('https://httpbin.org/post', data=login_data)

print(response.json()['form'])  # the form fields the server received

JSON数据提交

# POST a JSON body; requests serializes the dict and sets the
# Content-Type header to application/json automatically
payload = {"name": "John", "age": 30, "city": "New York"}

response = requests.post('https://httpbin.org/post', json=payload)

print(response.json()['json'])  # the JSON payload as echoed by the server

文件上传实战

# Upload a file as multipart/form-data. Open the file in a context
# manager so the handle is always closed — the original left the
# file object open (a resource leak) even after the request finished.
with open('test.jpg', 'rb') as fh:
    response = requests.post(
        'https://httpbin.org/post',
        files={'file': fh}
    )

print(response.json()['files'])  # the uploaded file as seen by the server

会话保持与Cookie管理

使用Session保持会话

# A Session persists cookies (and pools connections) across requests
with requests.Session() as s:
    # log in once; the session stores any Set-Cookie it receives
    s.post('https://example.com/login', data={'user': 'test'})

    # this request automatically carries the stored cookies
    response = s.get('https://example.com/dashboard')
    print(response.text)

手动处理Cookie

# Capture the cookies a server sets, then send them back by hand
first_response = requests.get('https://www.example.com')

response = requests.get(
    'https://www.example.com/profile',
    cookies=first_response.cookies  # replay the received cookie jar
)

异常处理与超时设置

基础异常处理

try:
    response = requests.get('https://www.example.com', timeout=5)
    # raise_for_status() turns 4xx/5xx responses into HTTPError
    response.raise_for_status()
except requests.exceptions.HTTPError as err:
    print(f"HTTP错误: {err}")
except requests.exceptions.Timeout:
    print("请求超时")
except requests.exceptions.RequestException as err:
    # catch-all for any other requests failure (connection, DNS, ...)
    print(f"请求异常: {err}")

重试机制实现

from requests.adapters import HTTPAdapter
# NOTE: requests.packages.urllib3 is a deprecated vendored alias;
# import Retry from urllib3 directly (installed alongside requests).
from urllib3.util.retry import Retry

session = requests.Session()

# Retry up to 3 times on common transient 5xx errors, backing off
# backoff_factor * 2**(attempt - 1) seconds between attempts.
retries = Retry(
    total=3,
    backoff_factor=0.3,
    status_forcelist=[500, 502, 503, 504],
)
# The original used typographic quotes (‘https://’), which is a
# SyntaxError in Python — plain ASCII quotes are required.
session.mount('https://', HTTPAdapter(max_retries=retries))

response = session.get('https://unstable-site.com/api')

SSL验证与代理设置

禁用SSL验证

# For test environments only (e.g. self-signed certificates):
# skip TLS certificate verification. Never do this in production.
response = requests.get(
    'https://self-signed-cert-site.com',
    verify=False
)

使用代理

# Route traffic through forward proxies, chosen per URL scheme
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}

response = requests.get('https://httpbin.org/ip', proxies=proxies)
print(response.json())  # the IP the target sees (the proxy's address)

实例代码

根据以上对requests的了解,我们可以构建适合自己的爬取函数,放在指定的py文件中,需要使用时引入即可,无需重复编写requests请求代码,如:

'''生成user-agent保存到user_agent.txt'''
def sc_user_agent():
    """Generate 1000 random User-Agent strings and append them to user_agent.txt.

    One User-Agent per line. The file is opened in append mode, so
    repeated calls keep adding entries.
    """
    ua_factory = UserAgent()

    # Open the file once — the original re-opened user_agent.txt for
    # every single line (1000 open() calls) and shadowed the `ua`
    # generator with the loop variable.
    with open('user_agent.txt', 'a', encoding='utf-8') as f:
        for _ in range(1000):
            f.write(ua_factory.random)
            f.write('\n')
            
            
'''随机代理'''    
def get_random_ip():
    """Return a random proxy IP read from ip.txt (one address per line).

    Raises IndexError if the file contains no non-empty lines.
    """
    with open('ip.txt', 'r', encoding='utf-8') as f:
        # splitlines() keeps the final entry even when the file has no
        # trailing newline — the original's split('\n')[0:-1] silently
        # dropped it. Blank lines are skipped either way.
        ip_lst = [line for line in f.read().splitlines() if line]
    return random.choice(ip_lst)
            
def get_random_useragent():
    """Return a random User-Agent string read from user_agent.txt.

    Raises IndexError if the file contains no non-empty lines.
    """
    with open('user_agent.txt', 'r', encoding='utf-8') as f:
        # splitlines() keeps the final entry even without a trailing
        # newline — split('\n')[0:-1] silently dropped it. Blank lines
        # are skipped either way.
        ua_lst = [line for line in f.read().splitlines() if line]
    return random.choice(ua_lst)
            

'''本局调用'''

def get_requests(url, headers, cookies, proxies, timeout):
    """GET `url` with optional random User-Agent / proxy, retrying up to 5 times.

    Parameters:
        url: target URL.
        headers: pass 1 to attach a random User-Agent from user_agent.txt;
            any other value sends no custom headers.
        cookies: accepted for interface compatibility but ignored — both
            branches of the original set it to None.
        proxies: pass 1 to route through a random HTTP proxy from ip.txt;
            any other value connects directly.
        timeout: per-request timeout in seconds.

    Returns the Response on HTTP 200, or None after 5 failed attempts.
    """
    max_retries = 5  # maximum number of attempts

    # BUG FIX: the original tested `headers == 1` / `proxies == 1` inside
    # the retry loop while also overwriting those very parameters, so from
    # the second attempt on the flags were lost and every retry ran without
    # a random UA/proxy. Capture the flags once, before the loop.
    use_random_ua = headers == 1
    use_random_proxy = proxies == 1

    for retry_count in range(max_retries):
        # Pick a fresh User-Agent / proxy for every attempt.
        req_headers = {"User-Agent": get_random_useragent()} if use_random_ua else None
        req_proxies = {"http": f"http://{get_random_ip()}"} if use_random_proxy else None
        req_cookies = None  # cookies parameter intentionally unused

        try:
            # Send the GET request with all configured options.
            response = requests.get(url, headers=req_headers, cookies=req_cookies,
                                    proxies=req_proxies, timeout=timeout, stream=True)

            # 200 means success — hand the response back to the caller.
            if response.status_code == 200:
                return response
            print(f"访问失败,状态码:{response.status_code}")

        except requests.exceptions.RequestException as e:
            print(f"请求过程中出现错误:{e}")

        # Wait 5 seconds before retrying (skip the wait after the last attempt).
        if retry_count + 1 < max_retries:
            print(f"请求失败,第 {retry_count + 1} 次重试...")
            time.sleep(5)

    # All attempts exhausted without a 200 response.
    print(f"请求失败,已超过最大重试次数 {max_retries} 次")
    return None
目录