Python请求伪装案例实现?

wen python案例 1

本文目录导读:

  1. 基本请求伪装
  2. 高级伪装技巧
  3. 综合伪装案例
  4. 检测是否被识别
  5. 注意事项

我来为你介绍Python中请求伪装(Request Spoofing)的几种常见实现方式。

基本请求伪装

1.1 伪装User-Agent

import random

import requests
from fake_useragent import UserAgent

# Method 1: let the fake_useragent library supply a random User-Agent string.
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
}
# Always pass a timeout: without one, requests may wait on the server forever.
response = requests.get('https://httpbin.org/headers', headers=headers, timeout=10)
# httpbin echoes the request headers back, so this shows what was actually sent.
print(response.json())

# Method 2: pick a User-Agent at random from a hand-maintained list.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15',
    'Mozilla/5.0 (iPad; CPU OS 14_0 like Mac OS X) AppleWebKit/605.1.15',
]
headers = {
    'User-Agent': random.choice(user_agents)
}
response = requests.get('https://httpbin.org/headers', headers=headers, timeout=10)

1.2 伪装IP地址(使用代理)

import requests
import random
# Example proxy entries (placeholders: real use requires working proxies).
proxies_list = [
    {'http': 'http://proxy1:8080', 'https': 'http://proxy1:8080'},
    {'http': 'http://proxy2:3128', 'https': 'http://proxy2:3128'},
]
# Pick one proxy at random for this request.
proxy = random.choice(proxies_list)
try:
    response = requests.get(
        'https://httpbin.org/ip',
        proxies=proxy,
        timeout=10
    )
    # httpbin reports the IP address it saw, i.e. the proxy's address on success.
    print(f"IP信息: {response.json()}")
except (requests.exceptions.RequestException, ValueError) as e:
    # RequestException covers connection, proxy and timeout failures; ValueError
    # covers a body that is not valid JSON. Anything else is a programming error
    # and is allowed to propagate instead of being reported as a dead proxy.
    print(f"代理失效: {e}")
# Route traffic through Tor (Tor must be installed).
def use_tor_proxy():
    """Return a new ``requests.Session`` configured to use a local SOCKS proxy.

    Both the ``http`` and ``https`` schemes are pointed at
    ``socks5h://127.0.0.1:9050``.
    NOTE(review): presumably 9050 is the local Tor client's SOCKS port and the
    ``socks5h`` scheme needs requests' optional SOCKS support installed —
    confirm against the deployment.
    """
    tor_endpoint = 'socks5h://127.0.0.1:9050'
    tor_session = requests.Session()
    tor_session.proxies = dict.fromkeys(('http', 'https'), tor_endpoint)
    return tor_session

高级伪装技巧

2.1 完整的浏览器指纹伪装

import requests
from fake_useragent import UserAgent
import random
class AdvancedSpoofer:
    """Send requests through one shared session using randomized browser-like headers."""

    def __init__(self):
        # One session is reused for every request, so cookies persist between calls.
        self.session = requests.Session()
        self.ua = UserAgent()

    def get_headers(self):
        """Return a new header dict with a random User-Agent and Accept-Language."""
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': random.choice([
                'zh-CN,zh;q=0.9,en;q=0.8',
                'en-US,en;q=0.9,zh-CN;q=0.8',
                'zh-TW,zh;q=0.9,en;q=0.8'
            ]),
            # NOTE(review): advertising 'br' assumes a Brotli decoder is available
            # to requests/urllib3 — confirm, otherwise bodies may arrive undecoded.
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
        }

    def add_referer(self, url):
        """Return a randomly chosen Referer value.

        ``url`` is accepted for interface compatibility; the choice does not
        depend on it.
        """
        referers = [
            'https://www.google.com/search?q=python',
            'https://www.baidu.com/s?wd=requests',
            'https://github.com/search?q=python+spider',
            'https://stackoverflow.com/questions/tagged/python'
        ]
        return random.choice(referers)

    def request_with_spoof(self, url, method='GET', timeout=10, delay_range=(0.5, 3)):
        """Send one request with spoofed headers after a random pause.

        Args:
            url: Target URL.
            method: ``'GET'`` (case-insensitive) issues a GET; any other value
                issues a POST.
            timeout: Seconds passed to requests as the timeout, so a silent
                server cannot block the caller forever.
            delay_range: ``(low, high)`` bounds in seconds for the random pause
                taken before the request.

        Returns:
            Whatever the session's ``get``/``post`` call returns.
        """
        # Imported locally because this snippet's top-level imports omit time.
        import time

        headers = self.get_headers()
        # Attach a Referer on roughly half of the requests.
        if random.random() > 0.5:
            headers['Referer'] = self.add_referer(url)
        # Random delay to imitate human pacing.
        time.sleep(random.uniform(*delay_range))
        # Normalize case so that 'get' is not silently sent as a POST.
        if method.upper() == 'GET':
            return self.session.get(url, headers=headers, timeout=timeout)
        return self.session.post(url, headers=headers, timeout=timeout)
# Usage example: request httpbin's header echo and print the decoded JSON.
spoofer = AdvancedSpoofer()
response = spoofer.request_with_spoof('https://httpbin.org/headers')
echoed = response.json()
print(echoed)

2.2 Cookie伪装

import requests
from selenium import webdriver
class CookieSpoofer:
    """Populate a ``requests.Session`` cookie jar from a browser or from supplied values."""

    def __init__(self):
        self.session = requests.Session()

    def extract_cookies_from_browser(self, url='https://example.com'):
        """Copy the cookies a Selenium-driven Chrome holds for ``url`` into the session.

        Args:
            url: Page to load before reading cookies; defaults to the address
                that was previously hard-coded.

        Returns:
            The session, with the browser's cookies added to its jar.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            for cookie in driver.get_cookies():
                # Carry domain/path across so the cookie is only sent back to
                # the site it came from, rather than to every host.
                self.session.cookies.set(
                    cookie['name'],
                    cookie['value'],
                    domain=cookie.get('domain', ''),
                    path=cookie.get('path', '/'),
                )
        finally:
            # Always shut the browser down, even when the page load fails.
            driver.quit()
        return self.session

    def manual_cookie_spoof(self, cookies_dict):
        """Set every name/value pair from ``cookies_dict`` on the session and return it."""
        for name, value in cookies_dict.items():
            self.session.cookies.set(name, value)
        return self.session

    def simulate_login_cookies(self):
        """Set a made-up group of login-style cookies on the session and return it."""
        # Imported locally: this snippet's top-level imports do not include
        # random, so the method would otherwise raise NameError when run alone.
        import random

        # A common shape for post-login cookies; every value is fabricated.
        fake_cookies = {
            'session_id': ''.join(random.choices('abcdef0123456789', k=32)),
            'csrf_token': ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=32)),
            'login_status': 'true',
            'user_role': 'normal_user',
        }
        return self.manual_cookie_spoof(fake_cookies)
# Usage example: load the fake login cookies, then have httpbin echo them back.
cookie_spoofer = CookieSpoofer()
# simulate_login_cookies() returns the spoofer's own session, so chain the GET on it.
response = cookie_spoofer.simulate_login_cookies().get('https://httpbin.org/cookies')
print(response.json())

综合伪装案例

3.1 爬虫伪装中间件

import requests
import random
import time
from collections import deque
class SpiderMiddleware:
    """Issue GET requests while rotating User-Agent strings and proxies, with retries."""

    def __init__(self):
        self.user_agents = deque([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
        ])
        # Each entry maps both schemes: with only an 'http' key, requests to
        # https:// URLs would not go through the proxy at all.
        self.proxies = deque([
            {'http': 'http://proxy1:8080', 'https': 'http://proxy1:8080'},
            {'http': 'http://proxy2:8080', 'https': 'http://proxy2:8080'},
        ])
        self.session = requests.Session()

    def get_next_user_agent(self):
        """Rotate the User-Agent queue one step left and return its new head."""
        self.user_agents.rotate(-1)
        return self.user_agents[0]

    def get_next_proxy(self):
        """Rotate the proxy queue one step left and return its new head."""
        self.proxies.rotate(-1)
        return self.proxies[0]

    def request(self, url, retry=3):
        """GET ``url`` with rotating identity, trying up to ``retry`` times.

        Every attempt uses the next User-Agent and, when any are configured,
        the next proxy, so a 403 is retried from a different proxy.

        Args:
            url: Target URL.
            retry: Maximum number of attempts.

        Returns:
            The first response whose status code is 200, or ``None`` if no
            attempt produced one.
        """
        for attempt in range(retry):
            # Waiting after the last attempt only delays returning None.
            is_last_attempt = attempt == retry - 1
            headers = {
                'User-Agent': self.get_next_user_agent(),
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            }
            proxy = self.get_next_proxy() if self.proxies else None
            try:
                response = self.session.get(
                    url,
                    headers=headers,
                    proxies=proxy,
                    timeout=10
                )
            except requests.exceptions.RequestException as e:
                # Only transport-level failures are retried; other exceptions
                # indicate a bug and are allowed to propagate.
                print(f"请求失败 (尝试 {attempt+1}/{retry}): {e}")
                if not is_last_attempt:
                    time.sleep(random.uniform(1, 3))
                continue
            if response.status_code == 200:
                return response
            if response.status_code == 429 and not is_last_attempt:
                # Rate limited: back off before the next attempt.
                time.sleep(random.uniform(5, 10))
            # Any other status (including 403) moves straight to the next attempt.
        return None
# Usage example: fetch httpbin's header echo through the rotating middleware.
middleware = SpiderMiddleware()
response = middleware.request(url='https://httpbin.org/headers')

3.2 完整的伪装请求封装

import requests
import random
import time
import hashlib
class CompleteSpoofingExample:
    """End-to-end example: build a fake device fingerprint and send it with a request."""

    @staticmethod
    def generate_fingerprint():
        """Return a dict of randomly chosen browser/device attributes.

        Width and height are drawn together as one pair, so the resolution is
        always a real combination rather than a mix such as 1920x768.
        """
        width, height = random.choice([
            (1920, 1080),
            (1366, 768),
            (1440, 900),
            (1536, 864),
        ])
        fingerprint = {
            'screen_resolution': f"{width}x{height}",
            'timezone': random.choice(['Asia/Shanghai', 'America/New_York', 'Europe/London']),
            'platform': random.choice(['Win32', 'MacIntel', 'Linux x86_64']),
            'languages': random.choice([['zh-CN', 'zh', 'en'], ['en-US', 'en'], ['zh-TW', 'zh', 'en']]),
            'device_memory': random.choice([4, 8, 16]),
            'hardware_concurrency': random.choice([4, 8, 12, 16])
        }
        return fingerprint

    @staticmethod
    def generate_session_hash():
        """Return a 32-character hex MD5 digest of the current time plus 16 random characters."""
        timestamp = str(time.time())
        random_str = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789', k=16))
        # MD5 is used here only to produce an opaque identifier, not for security.
        return hashlib.md5(f"{timestamp}{random_str}".encode()).hexdigest()

    def spoofed_request(self, url, method='GET', data=None):
        """Send one request carrying browser-like headers and a fake fingerprint.

        Args:
            url: Target URL.
            method: ``'GET'`` (case-insensitive) issues a GET; any other value
                issues a POST.
            data: Body passed through to the POST call; unused for GET.

        Returns:
            The response, or ``None`` when requests raises a
            ``RequestException`` (the error is printed).
        """
        fingerprint = self.generate_fingerprint()
        session_hash = self.generate_session_hash()
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # NOTE(review): advertising 'br' assumes a Brotli decoder is available
            # to requests/urllib3 — confirm, otherwise bodies may arrive undecoded.
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'DNT': '1',  # Do Not Track
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            # NOTE(review): an XMLHttpRequest marker alongside navigation-style
            # Sec-Fetch-* values looks inconsistent — confirm it is intended.
            'X-Requested-With': 'XMLHttpRequest',
            'X-Session-Id': session_hash,
            'X-Device-Info': str(fingerprint)
        }
        try:
            # The context manager closes the session on every path; the body has
            # already been read by then because the call is not streamed.
            with requests.Session() as session:
                if method.upper() == 'GET':
                    response = session.get(url, headers=headers, timeout=15)
                else:
                    response = session.post(url, headers=headers, data=data, timeout=15)
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            return None
        # Short pause after the response to imitate browser pacing.
        time.sleep(random.uniform(0.1, 0.5))
        return response
# Usage example
if __name__ == "__main__":
    spoof = CompleteSpoofingExample()
    result = spoof.spoofed_request('https://httpbin.org/anything')
    # Compare with None explicitly: a requests Response is falsy for 4xx/5xx
    # statuses, so a bare truthiness test would hide exactly the responses
    # whose status code is most worth seeing.
    if result is not None:
        print(f"状态码: {result.status_code}")
        print(f"响应内容: {result.text[:200]}")

检测是否被识别

def check_if_detected(response):
    """Report whether ``response`` shows signs of having been flagged as a crawler.

    Five signals are checked: an ``X-Block-Status: blocked`` header, status
    403, a captcha marker in the body, status 429, and an empty body. A
    one-line summary is printed either way.

    Returns:
        ``True`` if at least one signal matched, otherwise ``False``.
    """
    body = response.text
    status = response.status_code
    # (matched, label) pairs, in the order the labels appear in the summary.
    checks = (
        (response.headers.get('X-Block-Status') == 'blocked', '被明确阻止'),
        (status == 403, '返回403禁止访问'),
        ('captcha' in body.lower() or '验证码' in body, '触发了验证码'),
        (status == 429, '请求频率限制'),
        (len(body) == 0, '返回空数据'),
    )
    indicators = [label for matched, label in checks if matched]
    if not indicators:
        print("请求正常,未被识别")
        return False
    print(f"检测到被识别: {', '.join(indicators)}")
    return True

注意事项

  1. 合法性:请遵守网站的robots.txt和使用条款
  2. 频率控制:添加适当的延迟,避免对服务器造成压力
  3. 代理质量:免费代理通常不稳定,建议使用付费代理服务
  4. Cookie管理:某些网站会验证cookie的有效性
  5. 动态页面:对于JavaScript渲染的页面,可能需要使用Selenium等工具

这些伪装技术应该合理使用,主要用于合法的数据采集和测试目的。

标签: 请求头

抱歉,评论功能暂时关闭!