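本文目录导读:
1. 案例1:爬取免费API接口的JSON数据
2. 案例2:爬取动态加载的JSON数据
3. 案例3:爬取需要登录验证的JSON数据
4. 案例4:爬取分页JSON数据
5. 案例5:处理复杂的嵌套JSON数据
6. 关键要点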
我来分享几个Python爬取JSON数据的实战案例。
案例1:爬取免费API接口的JSON数据
import requests
import json
# 案例:获取天气数据(免费API)
def fetch_weather_data(timeout=10):
    """Fetch current weather plus an hourly forecast from the Open-Meteo API.

    Queries the forecast endpoint for latitude 39.9042 / longitude 116.4074,
    prints the current temperature, wind speed and weather code, and returns
    the whole decoded payload.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response, or ``None`` when the request fails, the
        body is not valid JSON, or the expected ``current_weather`` fields
        are missing.
    """
    url = "https://api.open-meteo.com/v1/forecast"
    params = {
        "latitude": 39.9042,
        "longitude": 116.4074,
        "current_weather": True,
        "hourly": "temperature_2m,precipitation",
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        weather_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"请求失败: {e}")
        return None

    # Read every field before printing so a malformed payload produces one
    # clear message instead of a traceback after partial output.
    try:
        current_weather = weather_data['current_weather']
        temperature = current_weather['temperature']
        windspeed = current_weather['windspeed']
        weathercode = current_weather['weathercode']
    except (KeyError, TypeError) as e:
        print(f"响应数据格式异常: {e}")
        return None

    print(f"温度: {temperature}°C")
    print(f"风速: {windspeed} km/h")
    print(f"天气代码: {weathercode}")
    return weather_data
# Run the example: performs the request at import time and keeps the result.
weather_info = fetch_weather_data()
案例2:爬取动态加载的JSON数据
import requests
import json
from bs4 import BeautifulSoup
def fetch_dynamic_json():
    """Fetch JSON the way a page's XHR request would, then save it to disk.

    Sends a GET request with browser-like headers to the JSONPlaceholder
    ``/posts`` endpoint, prints a short preview of the first five posts and
    writes the complete list to ``posts.json`` in the working directory.

    Returns:
        The decoded list of posts, or ``None`` when the server answers with a
        non-200 status or any step (request, decoding, preview, file write)
        raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json, text/plain, */*',
        'X-Requested-With': 'XMLHttpRequest',
    }
    # Stand-in endpoint that simulates a site's JSON API.
    url = "https://jsonplaceholder.typicode.com/posts"

    try:
        # The timeout keeps a stalled connection from hanging forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"请求失败,状态码: {response.status_code}")
            return None

        posts = response.json()

        # Preview only the first five entries.
        for post in posts[:5]:
            print(f"ID: {post['id']}")
            print(f"标题: {post['title']}")
            print(f"内容: {post['body'][:50]}...")
            print("-" * 50)

        # Persist the full result set.
        with open('posts.json', 'w', encoding='utf-8') as f:
            json.dump(posts, f, ensure_ascii=False, indent=2)
        return posts
    except Exception as e:
        # Deliberately broad: this example is best-effort and reports any
        # network, decoding, schema or file error instead of crashing.
        print(f"发生错误: {e}")
        return None
# Run the example: performs the request at import time and keeps the result.
data = fetch_dynamic_json()
案例3:爬取需要登录验证的JSON数据
import requests
import json
class DataFetcher:
    """Fetch JSON from an API that requires a bearer token obtained by login.

    One ``requests.Session`` is reused for all calls. After a successful
    ``login`` the token is stored in ``self.headers`` as an ``Authorization``
    header and sent with every later request.
    """

    # Seconds to wait for a response; without a timeout a stalled connection
    # would block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Content-Type': 'application/json',
        }

    def login(self, username, password):
        """Log in and remember the returned token.

        Args:
            username: Account name sent in the JSON login body.
            password: Password sent in the JSON login body.

        Returns:
            ``True`` when the server answered 200 with a non-empty ``token``
            field, which is then stored in the ``Authorization`` header.
            ``False`` on any other status, a missing token, or any error.
        """
        login_url = "https://api.example.com/login"
        login_data = {
            "username": username,
            "password": password,
        }

        try:
            response = self.session.post(
                login_url,
                json=login_data,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"登录失败,状态码: {response.status_code}")
                return False
            token = response.json().get('token')
        except Exception as e:
            print(f"登录失败: {e}")
            return False

        # A 200 response without a token must not count as success; otherwise
        # later requests would be sent with "Bearer None".
        if not token:
            print("登录失败: 响应中缺少token")
            return False

        self.headers['Authorization'] = f'Bearer {token}'
        return True

    def fetch_protected_data(self):
        """Fetch the protected resource using the stored headers.

        Returns:
            The decoded JSON body on a 200 response, otherwise ``None`` after
            printing the status code or the error.
        """
        data_url = "https://api.example.com/protected/data"

        try:
            response = self.session.get(
                data_url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                return response.json()
            print(f"获取数据失败: {response.status_code}")
        except Exception as e:
            print(f"请求失败: {e}")
        return None
# 使用示例
# fetcher = DataFetcher()
# if fetcher.login('your_username', 'your_password'):
# data = fetcher.fetch_protected_data()
# print(data)
案例4:爬取分页JSON数据
import requests
import json
import time
def fetch_paginated_data(max_pages=3):
    """Collect the most-starred "python" repositories from GitHub search.

    Requests the search endpoint page by page (10 results per page, sorted by
    stars descending), keeps a small summary of every repository and writes
    the combined list to ``repositories.json`` in the working directory.

    Args:
        max_pages: Number of result pages to request. Defaults to 3.

    Returns:
        A list of dicts with the keys ``name``, ``stars``, ``url`` and
        ``description``. Pages that fail are reported and skipped, so the
        list may be shorter than ``max_pages * 10``.
    """
    base_url = "https://api.github.com/search/repositories"
    query_params = {
        "q": "python",
        "sort": "stars",
        "order": "desc",
        "per_page": 10,
    }
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'application/json',
    }
    all_repos = []

    for page in range(1, max_pages + 1):
        if page > 1:
            # Pause between requests, including after a failed page, to
            # avoid hammering the API.
            time.sleep(1)

        try:
            response = requests.get(
                base_url,
                params={**query_params, "page": page},
                headers=headers,
                timeout=10,
            )
            if response.status_code != 200:
                # Report the status (e.g. rate limiting) instead of
                # silently skipping the page.
                print(f"第{page}页请求失败,状态码: {response.status_code}")
                continue

            items = response.json().get('items', [])
            page_repos = [
                {
                    'name': repo['full_name'],
                    'stars': repo['stargazers_count'],
                    'url': repo['html_url'],
                    # GitHub sends "description": null for repositories
                    # without one, so a .get() default would never apply.
                    'description': repo.get('description') or 'No description',
                }
                for repo in items
            ]
        except Exception as e:
            # Deliberately broad: one bad page must not abort the crawl.
            print(f"第{page}页请求失败: {e}")
            continue

        all_repos.extend(page_repos)
        print(f"第{page}页数据已获取")

        if not items:
            # An empty page means there are no further results.
            break

    # Save everything that was collected.
    with open('repositories.json', 'w', encoding='utf-8') as f:
        json.dump(all_repos, f, ensure_ascii=False, indent=2)
    return all_repos
# Run the example and report how many repository summaries were collected.
repos = fetch_paginated_data()
print(f"共获取到 {len(repos)} 个仓库信息")
案例5:处理复杂的嵌套JSON数据
import requests
import json
from typing import Dict, List
class JSONDataProcessor:
    """Query and flatten an already-decoded JSON document.

    ``data`` is the decoded top-level JSON object (a dict); the helper
    methods read it and never modify it.
    """

    def __init__(self, data: Dict):
        self.data = data

    @staticmethod
    def fetch_from_url(url: str, timeout: float = 10) -> Dict:
        """Download ``url`` and return its decoded JSON body.

        Args:
            url: Address to send the GET request to.
            timeout: Seconds to wait before giving up, so a stalled
                connection cannot block forever.

        Raises:
            requests.exceptions.RequestException: On connection errors,
                timeouts, or a 4xx/5xx status (via ``raise_for_status``).
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()

    def extract_nested_values(self, keys: List[str]) -> List:
        """Collect every value reachable through the key path ``keys``.

        The document is searched depth-first. When a dict contains the first
        key of the path, the search follows that key and does not look at the
        dict's other values; otherwise it descends into every value. Lists
        are searched element by element.

        Args:
            keys: Key path such as ``['owner', 'login']``.

        Returns:
            The matching values in discovery order. An empty path matches
            nothing and returns an empty list.
        """
        values = []
        if not keys:
            # Without this guard key_path[0] would raise IndexError.
            return values

        def extract_recursive(node, key_path):
            if isinstance(node, dict):
                head = key_path[0]
                if head in node:
                    if len(key_path) == 1:
                        values.append(node[head])
                    else:
                        extract_recursive(node[head], key_path[1:])
                else:
                    for child in node.values():
                        extract_recursive(child, key_path)
            elif isinstance(node, list):
                for item in node:
                    extract_recursive(item, key_path)

        extract_recursive(self.data, keys)
        return values

    def flatten_json(self, prefix='') -> Dict:
        """Flatten the nested document into a single-level dict.

        Nested dict keys are joined with ``.`` and list positions are
        appended as ``_<index>``, e.g. ``{"a": {"b": [1]}}`` becomes
        ``{"a.b_0": 1}``. Empty dicts and lists contribute no entries.

        Args:
            prefix: Optional string placed, followed by ``.``, in front of
                every resulting key.

        Returns:
            A dict mapping each full path to its leaf value.
        """
        flat = {}

        def process(path, value):
            # ``path`` is the full key built so far. Carrying it through the
            # recursion keeps nested keys from colliding with outer ones and
            # applies ``prefix`` exactly once.
            if isinstance(value, dict):
                for k, v in value.items():
                    process(f"{path}.{k}", v)
            elif isinstance(value, list):
                for i, item in enumerate(value):
                    process(f"{path}_{i}", item)
            else:
                flat[path] = value

        for k, v in self.data.items():
            process(f"{prefix}.{k}" if prefix else k, v)
        return flat
# Usage example: download one repository document, then query and flatten it.
url = "https://api.github.com/repos/python/cpython"

try:
    data = JSONDataProcessor.fetch_from_url(url)
    processor = JSONDataProcessor(data)

    # Pull out one specific nested field (owner -> login).
    selected = processor.extract_nested_values(["owner", "login"])
    print(f"仓库所有者: {selected}")

    # Flatten the document and report how many entries it produced.
    flat_data = processor.flatten_json()
    print(f"扁平化后的键值对数量: {len(flat_data)}")
except Exception as exc:
    print(f"错误: {exc}")
关键要点:
- 错误处理:始终使用try-except块处理网络和JSON解析错误
- 请求头设置:添加合适的User-Agent和Accept;用POST提交JSON数据时再设置Content-Type
- 限速:使用time.sleep()避免请求过快
- 状态检查:使用response.raise_for_status()检查HTTP状态码,遇到4xx/5xx时会抛出HTTPError
- 保存数据:及时将数据保存到文件,避免数据丢失
- 动态数据:对于页面动态加载的数据,使用浏览器开发者工具分析XHR请求
这些案例涵盖了JSON数据爬取的主要场景,你可以根据实际需求选择使用。