#6 Drift
最终方案用的是 Playwright,代码如下。我还写了代码拼接功能,把 JS 和 CSS 离线保存下来。
import hashlib
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def save_complete_page(output_dir="saved_page", target_url="https://XXX.com"):
    """Render a page with Playwright and save a self-contained offline copy.

    Opens *target_url* in a visible Chromium window, waits for the network to
    go idle, lets the operator inspect the page manually, then downloads every
    linked CSS/JS/image/font asset and rewrites the HTML so it references the
    local copies.

    Args:
        output_dir: Directory receiving ``index.html`` plus the
            ``css``/``js``/``images``/``fonts`` sub-directories.
        target_url: Page to capture. Defaults to the previously hard-coded
            placeholder URL for backward compatibility.
    """
    # Create the output directory and one sub-directory per asset type.
    assets_dirs = {
        "css": os.path.join(output_dir, "css"),
        "js": os.path.join(output_dir, "js"),
        "images": os.path.join(output_dir, "images"),
        "fonts": os.path.join(output_dir, "fonts"),
    }
    os.makedirs(output_dir, exist_ok=True)
    for dir_path in assets_dirs.values():
        os.makedirs(dir_path, exist_ok=True)

    with sync_playwright() as p:
        # headless=False on purpose: the operator inspects the page by hand.
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context()
            page = context.new_page()
            print(f"正在加载页面: {target_url}")
            # Wait until all network requests have settled before scraping.
            page.goto(target_url, wait_until="networkidle")

            print("浏览器已启动,请手动检查页面内容,完成后按下 Enter 键继续抓取...")
            input("按下 Enter 键继续抓取页面内容...")

            # Parse the *rendered* HTML (after JavaScript execution).
            soup = BeautifulSoup(page.content(), "html.parser")

            # Fonts first, matched by file extension on ANY <link href>.
            # (The original matched rel="stylesheet" again, which never hits a
            # font link, and ran after the CSS hrefs were already rewritten.)
            font_exts = (".woff", ".woff2", ".ttf")
            for link in soup.find_all("link", href=True):
                href = link["href"]
                if href.endswith(font_exts):
                    local_path = download_asset(href, target_url, assets_dirs["fonts"])
                    if local_path:
                        link["href"] = os.path.relpath(local_path, output_dir)

            # Stylesheets: download and point the <link> at the local copy.
            for link in soup.find_all("link", {"rel": "stylesheet"}):
                href = link.get("href")
                if href:
                    local_path = download_asset(href, target_url, assets_dirs["css"])
                    if local_path:
                        link["href"] = os.path.relpath(local_path, output_dir)

            # External scripts.
            for script in soup.find_all("script", src=True):
                local_path = download_asset(script["src"], target_url, assets_dirs["js"])
                if local_path:
                    script["src"] = os.path.relpath(local_path, output_dir)

            # Images (data: URIs are already inline; download_asset skips them).
            for img in soup.find_all("img", src=True):
                local_path = download_asset(img["src"], target_url, assets_dirs["images"])
                if local_path:
                    img["src"] = os.path.relpath(local_path, output_dir)

            # Write the rewritten HTML next to the downloaded assets.
            html_file_path = os.path.join(output_dir, "index.html")
            with open(html_file_path, "w", encoding="utf-8") as f:
                f.write(soup.prettify())
            print(f"HTML 文件已保存到: {html_file_path}")
        finally:
            # Always release Chromium, even if scraping fails midway.
            browser.close()
def download_asset(asset_url, base_url, save_dir):
    """Download a single static asset and return its local file path.

    Args:
        asset_url: Absolute or page-relative URL of the asset.
        base_url: URL of the page, used to resolve relative references.
        save_dir: Existing directory the file is written into.

    Returns:
        The local path of the saved file, or ``None`` when the URL is not
        downloadable (``data:``/``javascript:``/other non-HTTP scheme) or
        the request fails.
    """
    asset_url = urljoin(base_url, asset_url)  # resolve relative URLs
    parsed_url = urlparse(asset_url)
    # data:/blob:/javascript: URLs have nothing to fetch over HTTP — skip them
    # instead of letting requests raise on an unsupported scheme.
    if parsed_url.scheme not in ("http", "https"):
        return None
    filename = os.path.basename(parsed_url.path)
    if not filename:
        # URL path ends in "/": derive a stable filename from the URL itself
        # so we don't try to open() the directory.
        filename = hashlib.md5(asset_url.encode("utf-8")).hexdigest()
    local_path = os.path.join(save_dir, filename)
    try:
        print(f"正在下载: {asset_url}")
        response = requests.get(asset_url, stream=True, timeout=10)
        if response.status_code == 200:
            # Stream to disk in chunks to avoid holding large assets in memory.
            with open(local_path, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            return local_path
        print(f"下载失败: {asset_url} (状态码: {response.status_code})")
    except Exception as e:
        # A single failed asset must not abort the whole page capture.
        print(f"下载失败: {asset_url} (错误: {e})")
    return None
if __name__ == "__main__":
    # Entry point: run the interactive capture with default settings.
    save_complete_page()