Learn Python in 30 Days: 20. Introduction to Python Web Crawlers
20.1 Web Crawler Basics
20.1.1 What a Crawler Is and How It Works
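A crawler is a program that automatically fetches pages, extracts data from them, and follows links to new pages. A minimal sketch of that fetch-parse-follow cycle, using the requests and BeautifulSoup libraries covered later in this chapter (the start URL and page limit are placeholders):

import requests
from bs4 import BeautifulSoup

def simple_crawl(start_url, max_pages=5):
    # The fetch -> parse -> extract -> follow cycle behind every crawler
    to_visit, seen, titles = [start_url], set(), []
    while to_visit and len(seen) < max_pages:
        url = to_visit.pop()
        if url in seen:
            continue
        seen.add(url)
        html = requests.get(url, timeout=5).text            # 1. fetch
        soup = BeautifulSoup(html, 'html.parser')            # 2. parse
        titles.append(soup.title.string if soup.title else '')  # 3. extract
        to_visit += [a['href'] for a in soup.find_all('a', href=True)
                     if a['href'].startswith('http')]        # 4. follow links
    return titles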
20.1.2 Legal and Ethical Guidelines
Table 20-1 Key points for crawling legally
Concern | Description | Compliance advice |
robots protocol | The site's access rules | Follow robots.txt (see the sketch below) |
Request frequency | Control the interval between requests | Add appropriate delays |
Data usage | Copyright and privacy | Use the data only for lawful purposes |
User authentication | Login-protected content | Do not bypass authentication or verification |
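For the robots protocol row in Table 20-1, the standard library's urllib.robotparser can check whether a given URL may be fetched. A minimal sketch; MyCrawler is a placeholder user-agent name:

from urllib import robotparser
from urllib.parse import urlsplit

def allowed_by_robots(url, user_agent='MyCrawler'):
    # Build the robots.txt URL from the page URL and ask for permission
    parts = urlsplit(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, url)

print(allowed_by_robots('https://example.com/products'))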
20.2 Using Request Libraries
20.2.1 The requests Library
Basic usage:
import requests

def fetch_page(url):
    try:
        response = requests.get(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0',
                'Accept-Language': 'zh-CN'
            },
            timeout=5
        )
        response.raise_for_status()  # Raise an exception for non-2xx status codes
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

# Example usage
html = fetch_page('https://example.com')
20.2.2 Advanced Request Techniques
# Session reuse (cookies persist across requests)
session = requests.Session()
session.get('https://example.com/login', params={'user': 'test'})

# Proxy settings
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080'
}
response = requests.get(url, proxies=proxies)

# File download, streamed in chunks
with requests.get('https://example.com/image.jpg', stream=True) as r:
    with open('image.jpg', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
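Another common technique in this category is automatic retrying. requests does not retry by default, but a Session can be given an HTTPAdapter configured with urllib3's Retry. A sketch under the assumption that retrying three times with exponential back-off is acceptable for the target site; the status codes listed are a typical but arbitrary choice:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry = Retry(
    total=3,                                     # retry up to 3 times
    backoff_factor=0.5,                          # exponential back-off between attempts
    status_forcelist=[429, 500, 502, 503, 504]   # retry on these status codes
)
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

response = session.get('https://example.com', timeout=5)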
20.3 Data Parsing Techniques
20.3.1 Parsing with BeautifulSoup
from bs4 import BeautifulSoup

def parse_html(html):
    soup = BeautifulSoup(html, 'lxml')
    # CSS selectors
    titles = soup.select('h1.article-title')
    # Attribute extraction
    links = [a['href'] for a in soup.find_all('a', class_='external')]
    # Text extraction
    content = soup.find('div', id='content').get_text(strip=True, separator='\n')
    return {
        'titles': [t.text for t in titles],
        'links': links,
        'content': content
    }
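A quick usage example with a small inline HTML snippet; the class and id names match the ones parse_html above expects:

sample = '''
<h1 class="article-title">Hello</h1>
<a class="external" href="https://example.org">link</a>
<div id="content">First line<br>Second line</div>
'''
print(parse_html(sample))
# {'titles': ['Hello'], 'links': ['https://example.org'], 'content': 'First line\nSecond line'}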
20.3.2 XPath and lxml
from lxml import etree

def xpath_parse(html):
    tree = etree.HTML(html)
    # Extract product prices
    prices = tree.xpath('//div[@class="price"]/text()')
    # Extract nested data
    items = []
    for item in tree.xpath('//div[@class="product"]'):
        items.append({
            'name': item.xpath('.//h2/text()')[0],
            'sku': item.xpath('./@data-sku')[0]
        })
    return {'prices': prices, 'items': items}
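The same kind of quick check works for xpath_parse; the markup below follows the class and attribute names the XPath expressions above assume:

sample = '''
<div class="product" data-sku="A100">
  <h2>Widget</h2>
  <div class="price">9.9</div>
</div>
'''
print(xpath_parse(sample))
# {'prices': ['9.9'], 'items': [{'name': 'Widget', 'sku': 'A100'}]}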
20.4 Handling Dynamic Pages
20.4.1 Browser Automation with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def selenium_crawl(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Headless mode
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Wait for the dynamic content to appear
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".dynamic-content"))
        )
        # Execute JavaScript to scroll to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Grab the rendered page source
        html = driver.page_source
        return html
    finally:
        driver.quit()
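The rendered HTML returned by selenium_crawl can be handed straight to the parsers from section 20.3. A short usage sketch; the URL and selector are placeholders:

from bs4 import BeautifulSoup

html = selenium_crawl('https://example.com/dynamic')
if html:
    soup = BeautifulSoup(html, 'lxml')
    for node in soup.select('.dynamic-content'):
        print(node.get_text(strip=True))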
20.4.2 Reverse-Engineering the Data API
import json
import time

import requests

def api_crawl():
    # Call the XHR endpoint found in the browser's network panel
    api_url = 'https://api.example.com/data'
    params = {
        'page': 1,
        'size': 20,
        'timestamp': int(time.time() * 1000)
    }
    response = requests.get(api_url, params=params)
    data = response.json()
    # Walk through the JSON payload
    for item in data['list']:
        print(f"Item: {item['name']}, price: {item['price']}")
20.5 Data Storage Options
20.5.1 File Storage
import csv
import json

def save_to_csv(data, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

def save_to_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
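A quick usage example with a couple of hand-made records; the field names are arbitrary:

records = [
    {'id': 'A100', 'name': 'Widget', 'price': 9.9},
    {'id': 'A101', 'name': 'Gadget', 'price': 19.5},
]
save_to_csv(records, 'products.csv')
save_to_json(records, 'products.json')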
20.5.2 Database Storage
import sqlite3
import pymongo

# SQLite storage
def sqlite_save(data):
    conn = sqlite3.connect('data.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS products
                 (id TEXT, name TEXT, price REAL)''')
    c.executemany('INSERT INTO products VALUES (?,?,?)',
                  [(d['id'], d['name'], d['price']) for d in data])
    conn.commit()
    conn.close()

# MongoDB storage
def mongo_save(data):
    client = pymongo.MongoClient('mongodb://localhost:27017/')
    db = client['web_data']
    collection = db['products']
    collection.insert_many(data)
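If the same crawl is run repeatedly, a plain INSERT will accumulate duplicate rows. One option, sketched here under the assumption that id is unique, is to declare it as the primary key and use SQLite's INSERT OR REPLACE:

import sqlite3

def sqlite_upsert(data):
    conn = sqlite3.connect('data.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS products
                 (id TEXT PRIMARY KEY, name TEXT, price REAL)''')
    # INSERT OR REPLACE overwrites a row whose primary key already exists
    c.executemany('INSERT OR REPLACE INTO products VALUES (?,?,?)',
                  [(d['id'], d['name'], d['price']) for d in data])
    conn.commit()
    conn.close()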
20.6 Coping with Anti-Crawling Measures
20.6.1 Common Anti-Crawling Mechanisms
Table 20-2 Common anti-crawling techniques and countermeasures
Anti-crawling technique | How it shows up | Countermeasure |
User-Agent detection | Requests lack browser characteristics | Rotate User-Agent strings |
IP rate limiting | Frequent requests get the IP blocked | Use a proxy IP pool |
CAPTCHA | A verification page appears | Solving service / OCR recognition |
Encrypted request parameters | Parameters contain encrypted fields | Reverse-engineer the JavaScript |
Dynamic rendering | Data is loaded via JavaScript | Selenium / Puppeteer |
20.6.2 Advanced Countermeasures
# A simple proxy IP pool
class ProxyPool:
    def __init__(self):
        self.proxies = [
            'http://ip1:port',
            'http://ip2:port',
            # ...
        ]
        self.current = 0

    def get_proxy(self):
        # Rotate through the pool round-robin
        proxy = self.proxies[self.current % len(self.proxies)]
        self.current += 1
        return {'http': proxy, 'https': proxy}

# Randomly generated request headers
from fake_useragent import UserAgent

ua = UserAgent()

def get_random_headers():
    return {
        'User-Agent': ua.random,
        'Referer': 'https://www.google.com/',
        'Accept-Encoding': 'gzip, deflate, br'
    }
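The two pieces combine naturally in a request loop. A usage sketch; the proxy addresses in the pool above are placeholders, so this only works once real proxies are filled in:

import requests

pool = ProxyPool()

def fetch_with_rotation(url):
    try:
        return requests.get(
            url,
            headers=get_random_headers(),   # fresh User-Agent per request
            proxies=pool.get_proxy(),       # next proxy from the pool
            timeout=5
        )
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None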
20.7 Worked Examples
Example 1: An e-commerce product crawler
import requests
from bs4 import BeautifulSoup
import time
import random

def ecommerce_crawler(base_url, max_page=10):
    # Reuses fetch_page (section 20.2) and save_to_csv (section 20.5)
    products = []
    for page in range(1, max_page + 1):
        # Throttle requests with a random delay
        time.sleep(random.uniform(1, 3))
        url = f"{base_url}?page={page}"
        html = fetch_page(url)
        if not html:
            continue
        soup = BeautifulSoup(html, 'lxml')
        items = soup.select('.product-item')
        for item in items:
            try:
                products.append({
                    'name': item.select_one('.name').text.strip(),
                    'price': float(item.select_one('.price').text.replace('¥', '')),
                    'sku': item['data-sku'],
                    'rating': item.select_one('.rating').text.strip()
                })
            except Exception as e:
                print(f"Failed to parse item: {e}")
    save_to_csv(products, 'products.csv')
    return products

# Example usage
ecommerce_crawler('https://example.com/products')
Example 2: A news aggregation crawler
import datetime
import time

import schedule

# parse_rss, parse_news_api and store_news are assumed to be defined elsewhere
def news_monitor():
    sources = [
        'https://news.source1.com/rss',
        'https://news.source2.com/api/latest'
    ]
    all_news = []
    for url in sources:
        try:
            if 'rss' in url:
                # Parse the RSS feed
                news = parse_rss(url)
            else:
                # Call the JSON API
                news = parse_news_api(url)
            all_news.extend(news)
        except Exception as e:
            print(f"Failed to crawl {url}: {e}")
    # Deduplicate and store
    store_news(all_news)
    print(f"{datetime.datetime.now()} fetched {len(all_news)} news items")

# Scheduled task: run once an hour
schedule.every(1).hours.do(news_monitor)
while True:
    schedule.run_pending()
    time.sleep(60)
20.8 Knowledge Map
20.9 Learning Summary
Key points:
- Master HTTP requests and response handling
- Be fluent with the mainstream parsing tools
- Understand how dynamic pages load their data
- Be able to handle common anti-crawling measures
Practical advice:
- Follow crawler ethics and site rules
- Add random delays between requests
- Implement proper exception handling
- Maintain the proxy pool regularly
Next steps:
- Distributed crawler architectures
- Intelligent CAPTCHA recognition
- Data cleaning and analysis
- Reverse engineering of anti-crawling JavaScript
Common pitfalls:
- Triggering a site's protection mechanisms
- Parsing breaks when the page structure changes
- Unhandled character-encoding issues
- Insufficient awareness of legal risks
This Python programming learning log is updated continuously with new tips. Stay tuned!