本文目录导读:
我来分享几个Python查找优化的实战案例,从基础到进阶。
案例1:列表查找 vs 集合查找
import time
import random
# 生成测试数据
n = 1000000
data_list = list(range(n))
data_set = set(data_list)
search_items = [random.randint(0, n*2) for _ in range(10000)]
# 列表查找
start = time.time()
for item in search_items:
item in data_list
list_time = time.time() - start
# 集合查找
start = time.time()
for item in search_items:
item in data_set
set_time = time.time() - start
print(f"列表查找时间: {list_time:.4f}秒")
print(f"集合查找时间: {set_time:.4f}秒")
print(f"集合比列表快了 {list_time/set_time:.0f} 倍")
案例2:字典查找优化
from collections import defaultdict, Counter
import time
# 场景:统计单词频率并快速查找
words = ["apple", "banana", "apple", "orange", "banana"] * 100000
# 方法1:普通字典
def count_with_dict():
count = {}
for word in words:
if word in count:
count[word] += 1
else:
count[word] = 1
return count
# 方法2:defaultdict
def count_with_defaultdict():
count = defaultdict(int)
for word in words:
count[word] += 1
return count
# 方法3:Counter(最快)
def count_with_counter():
return Counter(words)
# 性能对比
for func in [count_with_dict, count_with_defaultdict, count_with_counter]:
start = time.time()
result = func()
print(f"{func.__name__}: {time.time() - start:.4f}秒")
案例3:二分查找优化
import bisect
import time
import random
# 有序列表查找优化
data = sorted([random.randint(0, 1000000) for _ in range(100000)])
search_items = [random.randint(0, 1000000) for _ in range(1000)]
# 线性查找
def linear_search(arr, targets):
results = []
for target in targets:
found = False
for item in arr:
if item == target:
found = True
break
results.append(found)
return results
# 二分查找
def binary_search(arr, targets):
results = []
for target in targets:
idx = bisect.bisect_left(arr, target)
results.append(idx < len(arr) and arr[idx] == target)
return results
# 性能对比
start = time.time()
linear_search(data, search_items)
print(f"线性查找: {time.time() - start:.4f}秒")
start = time.time()
binary_search(data, search_items)
print(f"二分查找: {time.time() - start:.4f}秒")
案例4:字符串查找优化
import re
import time
# 场景:在大量文本中查找特定模式
text = "Python" * 100000 + "优化" * 100000
# 方法1:使用in操作符
def find_with_in(text, pattern):
return pattern in text
# 方法2:使用find方法
def find_with_find(text, pattern):
return text.find(pattern) != -1
# 方法3:使用正则表达式(最适合复杂模式)
def find_with_regex(text, pattern):
return bool(re.search(pattern, text))
# 测试性能
patterns = ["Python", "优化", "不存在"]
for pattern in patterns:
print(f"\n查找 '{pattern}':")
for func in [find_with_in, find_with_find, find_with_regex]:
start = time.time()
for _ in range(1000):
func(text, pattern)
print(f"{func.__name__}: {time.time() - start:.4f}秒")
案例5:实战:用户系统查找优化
import hashlib
import time
from collections import defaultdict
# 模拟用户数据
class UserDatabase:
def __init__(self, size=100000):
self.users = []
self.id_index = {}
self.email_index = defaultdict(list)
self.name_index = {}
self._generate_users(size)
def _generate_users(self, size):
for i in range(size):
user = {
'id': i,
'name': f'user_{i}',
'email': f'user{i}@example.com',
'phone': f'138{i:08d}'
}
self.users.append(user)
self.id_index[i] = user
domain = user['email'].split('@')[1]
self.email_index[domain].append(user)
# 无索引查找
def find_by_id_linear(self, user_id):
for user in self.users:
if user['id'] == user_id:
return user
return None
# 有索引查找
def find_by_id_indexed(self, user_id):
return self.id_index.get(user_id)
# 按域名查找(多值索引)
def find_by_domain(self, domain):
return self.email_index.get(domain, [])
# 性能测试
db = UserDatabase(100000)
# 测试ID查找
user_id = 50000
start = time.time()
for _ in range(1000):
db.find_by_id_linear(user_id)
print(f"线性查找: {time.time() - start:.4f}秒")
start = time.time()
for _ in range(1000):
db.find_by_id_indexed(user_id)
print(f"索引查找: {time.time() - start:.4f}秒")
# 测试域名查找
start = time.time()
domain_users = db.find_by_domain('example.com')
print(f"域名查找: 找到 {len(domain_users)} 个用户")
案例6:高级:LRU缓存优化
from functools import lru_cache
import time
# 模拟计算密集型函数
def expensive_function(n):
"""模拟耗时计算"""
time.sleep(0.001) # 模拟计算延迟
return n * n + n
# 不带缓存的版本
def find_without_cache(numbers):
results = []
for n in numbers:
results.append(expensive_function(n))
return results
# 带LRU缓存的版本
@lru_cache(maxsize=1000)
def cached_expensive_function(n):
return expensive_function(n)
def find_with_cache(numbers):
results = []
for n in numbers:
results.append(cached_expensive_function(n))
return results
# 测试数据(包含重复值)
test_data = [i % 500 for i in range(1000)]
# 性能对比
start = time.time()
find_without_cache(test_data)
print(f"无缓存: {time.time() - start:.4f}秒")
start = time.time()
find_with_cache(test_data)
print(f"LRU缓存: {time.time() - start:.4f}秒")
# 查看缓存信息
print(f"缓存命中率: {cached_expensive_function.cache_info()}")
性能优化总结表
| 查找场景 | 优化前 | 优化后 | 速度提升 |
|---|---|---|---|
| 成员检查 | 列表(list) | 集合(set) | 100-1000x |
| 频率统计 | 普通字典 | Counter | 2-5x |
| 有序查找 | 线性搜索 | 二分查找 | 100-1000x |
| 字符串查找 | 复杂模式手写 | 正则表达式 | 10-100x |
| 重复计算 | 每次重新计算 | LRU缓存 | 100-10000x |
选择建议
- 小数据量 (<100):不必过度优化,列表查找足够
- 大数据量:优先考虑集合(set)、字典(dict)
- 频繁查找:建立索引(字典、集合)
- 有序数据:使用二分查找(bisect)
- 重复计算:添加缓存(lru_cache)
这些实战案例覆盖了日常开发中最常见的查找优化场景,你可以根据实际需求选择合适的优化策略。
标签: 哈希索引