本文目录导读:
我来为你介绍Python中请求伪装(Request Spoofing)的几种常见实现方式。
基本请求伪装
1 伪装User-Agent
import requests
from fake_useragent import UserAgent
# Approach 1: let the fake_useragent library supply a random User-Agent string.
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
}
# requests applies no timeout by default, so an unresponsive server would
# block this script forever; 10 s matches the proxy example further down.
response = requests.get('https://httpbin.org/headers', headers=headers, timeout=10)
# Print the decoded JSON body of the reply.
print(response.json())
# Approach 2: choose a User-Agent at random from a hand-maintained list.
import random
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15',
    'Mozilla/5.0 (iPad; CPU OS 14_0 like Mac OS X) AppleWebKit/605.1.15',
]
headers = {
    'User-Agent': random.choice(user_agents),
}
# requests applies no timeout by default; bound the wait so a stalled
# server cannot hang the script.
response = requests.get('https://httpbin.org/headers', headers=headers, timeout=10)
2 伪装IP地址(使用代理)
import requests
import random
# Example proxy pool (placeholder hosts; real use needs working proxies).
# Each entry routes both http and https traffic through the same endpoint.
proxies_list = [
    {scheme: endpoint for scheme in ('http', 'https')}
    for endpoint in ('http://proxy1:8080', 'http://proxy2:3128')
]
# Pick one proxy at random for this request.
proxy = random.choice(proxies_list)
try:
    response = requests.get('https://httpbin.org/ip', proxies=proxy, timeout=10)
    print(f"IP信息: {response.json()}")
except Exception as err:
    # Any failure (connection, timeout, bad body) is reported as a dead proxy.
    print(f"代理失效: {err}")
def use_tor_proxy():
    """Return a new requests session routed through a local Tor SOCKS proxy.

    Requires Tor to be installed and listening on 127.0.0.1:9050. Both the
    http and https schemes are pointed at the same ``socks5h://`` endpoint.

    Returns:
        requests.Session: a fresh session with its ``proxies`` mapping set.
    """
    tor_endpoint = 'socks5h://127.0.0.1:9050'
    tor_session = requests.Session()
    tor_session.proxies = dict.fromkeys(('http', 'https'), tor_endpoint)
    return tor_session
高级伪装技巧
1 完整的浏览器指纹伪装
import requests
from fake_useragent import UserAgent
import random
class AdvancedSpoofer:
    """Send requests with randomized, browser-like headers over one session."""

    def __init__(self):
        # One session is shared by every request so cookies and connections persist.
        self.session = requests.Session()
        self.ua = UserAgent()

    def get_headers(self):
        """Build a fresh header dict with a random User-Agent and Accept-Language.

        Returns:
            dict: header names mapped to string values.
        """
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': random.choice([
                'zh-CN,zh;q=0.9,en;q=0.8',
                'en-US,en;q=0.9,zh-CN;q=0.8',
                'zh-TW,zh;q=0.9,en;q=0.8',
            ]),
            # NOTE(review): 'br' is advertised here; requests is documented to
            # decode Brotli only when a brotli package is installed -- confirm.
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
        }

    def add_referer(self, url):
        """Return a randomly chosen Referer value.

        Args:
            url: the target URL. Accepted for interface compatibility; the
                choice does not currently depend on it.

        Returns:
            str: one of a fixed set of search/result page URLs.
        """
        referers = [
            'https://www.google.com/search?q=python',
            'https://www.baidu.com/s?wd=requests',
            'https://github.com/search?q=python+spider',
            'https://stackoverflow.com/questions/tagged/python',
        ]
        return random.choice(referers)

    def request_with_spoof(self, url, method='GET', timeout=10, delay=(0.5, 3)):
        """Send one request with spoofed headers after a random pause.

        Args:
            url: target URL.
            method: 'GET' (case-insensitive) issues a GET; any other value
                issues a POST.
            timeout: seconds passed to requests; without it a stalled server
                would block indefinitely.
            delay: ``(low, high)`` bounds in seconds for the random pause
                that precedes the request.

        Returns:
            The response object returned by the session.

        Raises:
            requests.RequestException: propagated unchanged from the session.
        """
        # Local import: this snippet's import header does not bring in `time`.
        import time

        headers = self.get_headers()
        # Attach a Referer on roughly half of the requests.
        if random.random() > 0.5:
            headers['Referer'] = self.add_referer(url)
        # Random pause so requests are not evenly spaced.
        time.sleep(random.uniform(*delay))
        # Normalise case so 'get' is not silently sent as a POST.
        if method.upper() == 'GET':
            return self.session.get(url, headers=headers, timeout=timeout)
        return self.session.post(url, headers=headers, timeout=timeout)
# Usage example: build the spoofer and fetch a URL with randomized headers.
spoofer = AdvancedSpoofer()
response = spoofer.request_with_spoof('https://httpbin.org/headers')
# Decode the response body as JSON and print it.
print(response.json())
2 Cookie伪装
import requests
from selenium import webdriver
class CookieSpoofer:
    """Populate a requests session with cookies from a browser or a dict."""

    def __init__(self):
        self.session = requests.Session()

    def extract_cookies_from_browser(self, url='https://example.com'):
        """Open ``url`` in Chrome via selenium and copy its cookies to the session.

        Args:
            url: page to load before reading cookies.

        Returns:
            The session, now holding the browser's cookies.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            for cookie in driver.get_cookies():
                # Carry over domain/path when the browser reports them, so the
                # cookie is not sent to unrelated hosts.
                scope = {
                    key: cookie[key]
                    for key in ('domain', 'path')
                    if cookie.get(key)
                }
                self.session.cookies.set(cookie['name'], cookie['value'], **scope)
        finally:
            # Always shut the browser down, even when the page load fails.
            driver.quit()
        return self.session

    def manual_cookie_spoof(self, cookies_dict):
        """Set every name/value pair from ``cookies_dict`` on the session.

        Args:
            cookies_dict: mapping of cookie name to cookie value.

        Returns:
            The session, for chaining.
        """
        for name, value in cookies_dict.items():
            self.session.cookies.set(name, value)
        return self.session

    def simulate_login_cookies(self):
        """Install fabricated cookies shaped like a logged-in session.

        The values are random placeholders, not real credentials.

        Returns:
            The session, now holding the fabricated cookies.
        """
        # Local import: this snippet's import header never imports `random`,
        # so the method would otherwise raise NameError when run standalone.
        import random

        # A typical set of cookie names seen after login.
        fake_cookies = {
            'session_id': ''.join(random.choices('abcdef0123456789', k=32)),
            'csrf_token': ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=32)),
            'login_status': 'true',
            'user_role': 'normal_user',
        }
        return self.manual_cookie_spoof(fake_cookies)
# Usage example: seed the session with fabricated login cookies, then send them.
cookie_spoofer = CookieSpoofer()
cookie_spoofer.simulate_login_cookies()
response = cookie_spoofer.session.get('https://httpbin.org/cookies')
# Decode the response body as JSON and print it.
print(response.json())
综合伪装案例
1 爬虫伪装中间件
import requests
import random
import time
from collections import deque
class SpiderMiddleware:
    """Issue GET requests while rotating User-Agents and proxies, with retries."""

    def __init__(self):
        self.user_agents = deque([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
        ])
        # requests selects a proxy by URL scheme, so each entry needs an
        # 'https' key too; otherwise https:// URLs go out directly.
        self.proxies = deque([
            {'http': 'http://proxy1:8080', 'https': 'http://proxy1:8080'},
            {'http': 'http://proxy2:8080', 'https': 'http://proxy2:8080'},
        ])
        self.session = requests.Session()

    def get_next_user_agent(self):
        """Rotate the User-Agent pool by one and return the new front entry."""
        self.user_agents.rotate(-1)
        return self.user_agents[0]

    def get_next_proxy(self):
        """Rotate the proxy pool by one and return the new front entry."""
        self.proxies.rotate(-1)
        return self.proxies[0]

    def request(self, url, retry=3):
        """GET ``url`` with rotating identity, retrying on failure.

        Every attempt uses the next User-Agent and, if any are configured,
        the next proxy. A 429 reply triggers a 5-10 s back-off, a transport
        error a 1-3 s back-off; other non-200 replies (403 included) are
        retried immediately. No back-off follows the final attempt.

        Args:
            url: target URL.
            retry: maximum number of attempts.

        Returns:
            The first response with status 200, or None if none arrived.
        """
        for attempt in range(retry):
            is_last_attempt = attempt == retry - 1
            headers = {
                'User-Agent': self.get_next_user_agent(),
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            }
            proxy = self.get_next_proxy() if self.proxies else None
            try:
                response = self.session.get(
                    url,
                    headers=headers,
                    proxies=proxy,
                    timeout=10,
                )
            except requests.RequestException as e:
                # Only transport-level failures are retried; programming
                # errors are no longer swallowed.
                print(f"请求失败 (尝试 {attempt+1}/{retry}): {e}")
                if not is_last_attempt:
                    time.sleep(random.uniform(1, 3))
                continue
            if response.status_code == 200:
                return response
            if response.status_code == 429 and not is_last_attempt:
                # Rate limited: wait before the next attempt.
                time.sleep(random.uniform(5, 10))
            # 403 and other statuses: retry at once with the rotated identity.
        return None
# Usage example: request() returns the response on HTTP 200, or None once retries run out.
middleware = SpiderMiddleware()
response = middleware.request('https://httpbin.org/headers')
2 完整的伪装请求封装
import requests
import random
import time
import hashlib
class CompleteSpoofingExample:
    """End-to-end example: fabricate a fingerprint and send one spoofed request."""

    @staticmethod
    def generate_fingerprint():
        """Return a dict of randomly chosen browser/device attributes.

        Width and height are drawn independently, so the resolution need not
        be a real-world pair.
        """
        fingerprint = {
            'screen_resolution': f"{random.choice([1920, 1366, 1440, 1536])}x{random.choice([1080, 768, 900, 864])}",
            'timezone': random.choice(['Asia/Shanghai', 'America/New_York', 'Europe/London']),
            'platform': random.choice(['Win32', 'MacIntel', 'Linux x86_64']),
            'languages': random.choice([['zh-CN', 'zh', 'en'], ['en-US', 'en'], ['zh-TW', 'zh', 'en']]),
            'device_memory': random.choice([4, 8, 16]),
            'hardware_concurrency': random.choice([4, 8, 12, 16]),
        }
        return fingerprint

    @staticmethod
    def generate_session_hash():
        """Return a 32-character hex id from the current time plus random text.

        MD5 serves only as a cheap mixing function; the value is not a secret.
        """
        timestamp = str(time.time())
        random_str = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789', k=16))
        return hashlib.md5(f"{timestamp}{random_str}".encode()).hexdigest()

    def spoofed_request(self, url, method='GET', data=None):
        """Send one request carrying browser-like and fingerprint headers.

        Args:
            url: target URL.
            method: 'GET' (case-insensitive) issues a GET; any other value
                issues a POST.
            data: body forwarded to the POST; unused for GET.

        Returns:
            The response, or None if requests raised a RequestException.
        """
        fingerprint = self.generate_fingerprint()
        session_hash = self.generate_session_hash()
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'DNT': '1',  # Do Not Track
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            # NOTE(review): an XHR marker alongside the navigation-style
            # Sec-Fetch-* values looks contradictory -- confirm it is wanted.
            'X-Requested-With': 'XMLHttpRequest',
            'X-Session-Id': session_hash,
            'X-Device-Info': str(fingerprint),
        }
        try:
            # The context manager closes the per-call session and its pooled
            # connections; the body is already read because stream is off.
            with requests.Session() as session:
                # Normalise case so 'get' is not silently sent as a POST.
                if method.upper() == 'GET':
                    response = session.get(url, headers=headers, timeout=15)
                else:
                    response = session.post(url, headers=headers, data=data, timeout=15)
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            return None
        # Short random pause after a successful request.
        time.sleep(random.uniform(0.1, 0.5))
        return response
# Usage example
if __name__ == "__main__":
    spoof = CompleteSpoofingExample()
    result = spoof.spoofed_request('https://httpbin.org/anything')
    # Compare against None explicitly: a requests Response is falsy for
    # 4xx/5xx statuses, so `if result:` would hide error responses.
    if result is not None:
        print(f"状态码: {result.status_code}")
        print(f"响应内容: {result.text[:200]}")
检测是否被识别
def check_if_detected(response):
    """Report whether a response shows signs of having been flagged as a bot.

    Five signals are checked in a fixed order: an ``X-Block-Status: blocked``
    header, status 403, a captcha marker in the body, status 429, and an
    empty body. The outcome is printed as well as returned.

    Args:
        response: object exposing ``headers``, ``status_code`` and ``text``.

    Returns:
        bool: True if at least one signal fired, otherwise False.
    """
    page_text = response.text
    status = response.status_code
    # (condition, label) pairs, kept in reporting order.
    signals = [
        (response.headers.get('X-Block-Status') == 'blocked', '被明确阻止'),
        (status == 403, '返回403禁止访问'),
        ('captcha' in page_text.lower() or '验证码' in page_text, '触发了验证码'),
        (status == 429, '请求频率限制'),
        (not page_text, '返回空数据'),
    ]
    indicators = [label for fired, label in signals if fired]
    if not indicators:
        print("请求正常,未被识别")
        return False
    print(f"检测到被识别: {', '.join(indicators)}")
    return True
注意事项
- 合法性:请遵守网站的robots.txt和使用条款
- 频率控制:添加适当的延迟,避免对服务器造成压力
- 代理质量:免费代理通常不稳定,建议使用付费代理服务
- Cookie管理:某些网站会验证cookie的有效性
- 动态内容:对于JavaScript渲染的页面,可能需要使用Selenium等工具
这些伪装技术应该合理使用,主要用于合法的数据采集和测试目的。
标签: 请求头