#!/usr/bin/env python3
"""
web_to_md.py - 將網頁轉換為 Markdown
使用方式
# 單頁模式
python web_to_md.py --url https://example.com/features --output output.md
# 爬蟲模式(追蹤同域連結)
python web_to_md.py --url https://example.com --crawl --depth 2 --output-dir ./scraped/
# 爬蟲模式(只爬特定路徑前綴)
python web_to_md.py --url https://example.com/docs --crawl --depth 3 --same-path --output-dir ./docs/
"""
import argparse
import os
import re
import sys
import time
from collections import deque
from datetime import datetime
from urllib.parse import urljoin, urlparse
try:
import requests
from bs4 import BeautifulSoup
import html2text
except ImportError:
print("缺少必要套件請執行pip install requests beautifulsoup4 html2text lxml")
sys.exit(1)
# ── Configuration ─────────────────────────────────────────────────────────────
# Browser-like request headers: many sites serve degraded or blocked pages
# to obvious script clients.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
}
# URL substrings skipped automatically while crawling (auth pages, static
# assets, binary downloads, feeds, APIs).
DEFAULT_EXCLUDE_PATTERNS = [
    "/login", "/logout", "/signup", "/register",
    "/cdn-cgi/", "/__", "/static/", "/assets/",
    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
    ".pdf", ".zip", ".tar", ".gz",
    ".css", ".js", ".woff", ".woff2",
    "/api/", "/feed/", "/rss", "/atom",
    "#",  # anchor links
]
# Noise elements removed from pages to obtain clean body text.
# Plain tag names are handled with find_all(); bracketed entries are CSS
# attribute selectors handled with select().
NOISE_TAGS = [
    "nav", "header", "footer", "aside",
    "script", "style", "noscript", "iframe",
    "[class*='cookie']", "[class*='banner']", "[class*='popup']",
    "[class*='modal']", "[class*='overlay']", "[id*='cookie']",
    "[class*='nav']", "[class*='footer']", "[class*='header']",
    "[class*='sidebar']", "[class*='advertisement']", "[class*='ad-']",
]
# ── URL 工具 ──────────────────────────────────────────────────────────────────
def normalize_url(url: str) -> str:
    """Canonicalize *url*: drop the fragment and any trailing slashes."""
    without_fragment = urlparse(url)._replace(fragment="")
    return without_fragment.geturl().rstrip("/")
def is_same_domain(url: str, base_url: str) -> bool:
    """Return True when *url* lives on the same host[:port] as *base_url*."""
    candidate_host = urlparse(url).netloc
    base_host = urlparse(base_url).netloc
    return candidate_host == base_host
def is_same_path_prefix(url: str, base_url: str) -> bool:
    """Return True when *url*'s path sits at or under *base_url*'s path.

    BUGFIX: a plain ``startswith`` also matched sibling paths that merely
    share a textual prefix (base ``/docs`` matched ``/docs-v2``).  The
    comparison now requires either the same path or a ``/``-separated
    descendant, so only genuine sub-pages pass.
    """
    base_path = urlparse(base_url).path.rstrip("/")
    url_path = urlparse(url).path
    # Same page (ignoring trailing slash) or a true sub-path segment.
    return url_path.rstrip("/") == base_path or url_path.startswith(base_path + "/")
def should_skip(url: str, exclude_patterns: list[str]) -> bool:
    """Case-insensitively test whether any exclusion pattern occurs in *url*."""
    lowered = url.lower()
    for pattern in exclude_patterns:
        if pattern in lowered:
            return True
    return False
def url_to_filename(url: str) -> str:
    """Derive a filesystem-safe ``.md`` file name from *url*.

    Path separators become ``__``, unsafe characters become ``-``, and up
    to 50 sanitized query characters are appended to keep names distinct.
    """
    parts = urlparse(url)
    stem = parts.path.strip("/")
    if not stem:
        stem = "index"
    stem = re.sub(r"[^\w\-/]", "-", stem).replace("/", "__")
    if parts.query:
        safe_query = re.sub(r"[^\w\-]", "-", parts.query)[:50]
        stem = f"{stem}_{safe_query}"
    # Cap the stem length so names stay well under filesystem limits.
    return stem[:100] + ".md"
# ── HTML → Markdown ───────────────────────────────────────────────────────────
def clean_html(html_content: str, base_url: str) -> BeautifulSoup:
    """Strip noise elements and return the page's main content region.

    Args:
        html_content: Raw HTML text of the fetched page.
        base_url: URL the page came from.  Not consulted in this function;
            kept for interface symmetry with html_to_markdown.

    Returns:
        The BeautifulSoup tag (or document) holding the main content, with
        noise elements removed in place.
    """
    soup = BeautifulSoup(html_content, "lxml")
    # Locate the primary content area: semantic containers first, then
    # heuristic id/class matches, finally <body> or the whole document.
    main_content = (
        soup.find("main") or
        soup.find(attrs={"role": "main"}) or
        soup.find("article") or
        soup.find(id=re.compile(r"(content|main|body)", re.I)) or
        soup.find(class_=re.compile(r"(content|main|body|post)", re.I)) or
        soup.body or
        soup
    )
    # Remove noise elements.
    for tag in NOISE_TAGS:
        if tag.startswith("["):
            # CSS attribute-selector form.  select()/decompose() failures
            # are deliberately swallowed: cleanup is best-effort.
            try:
                for el in main_content.select(tag):
                    el.decompose()
            except Exception:
                pass
        else:
            for el in main_content.find_all(tag):
                el.decompose()
    return main_content
def html_to_markdown(html_content: str, url: str, title: str = "") -> str:
    """Convert raw HTML into clean Markdown with a frontmatter header.

    The page is first reduced to its main content via clean_html, then
    converted with html2text, lightly post-processed, and prefixed with a
    ``title`` / ``url`` / ``crawled_at`` frontmatter block.
    """
    cleaned = clean_html(html_content, url)

    # Configure html2text.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.ignore_images = False
    converter.ignore_tables = False
    converter.body_width = 0  # no hard wrapping
    converter.protect_links = True
    converter.wrap_links = False
    converter.mark_code = True
    converter.ul_item_mark = "-"
    converter.emphasis_mark = "*"
    converter.strong_mark = "**"
    converter.baseurl = url

    body = converter.handle(str(cleaned))
    # Basic cleanup: collapse blank-line runs, drop trailing spaces.
    body = re.sub(r"\n{3,}", "\n\n", body)
    body = re.sub(r" +\n", "\n", body)
    body = body.strip()

    # Prepend frontmatter.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    header = (
        "---\n"
        f"title: {title or '(無標題)'}\n"
        f"url: {url}\n"
        f"crawled_at: {stamp}\n"
        "---\n"
    )
    return header + body
def fetch_page(url: str, session: requests.Session, timeout: int = 15) -> tuple[str, str]:
    """Download *url* and return ``(html_content, page_title)``.

    Returns ``("", "")`` when the request fails or the response is not HTML.
    The title comes from <title>, falling back to the first <h1>.
    """
    try:
        resp = session.get(url, timeout=timeout, allow_redirects=True)
        resp.raise_for_status()
        if "html" not in resp.headers.get("Content-Type", ""):
            return "", ""
        # Trust charset detection over the declared encoding.
        resp.encoding = resp.apparent_encoding or "utf-8"
        html = resp.text

        # Extract a title.
        soup = BeautifulSoup(html, "lxml")
        title = ""
        if soup.title:
            title = soup.title.string or ""
        if not title:
            heading = soup.find("h1")
            if heading:
                title = heading.get_text(strip=True)
        return html, title.strip()
    except requests.exceptions.RequestException as e:
        print(f" ⚠️ 抓取失敗 {url}: {e}", file=sys.stderr)
        return "", ""
def extract_links(html: str, base_url: str) -> list[str]:
    """Extract, absolutize and normalize every ``<a href>`` link in *html*.

    ``javascript:`` and ``mailto:`` links are skipped.  Duplicates are
    removed while preserving first-seen order (the crawler previously
    enqueued every repeated occurrence of a link on the same page, bloating
    its queue before the visited-set check).

    Args:
        html: Raw HTML to scan.
        base_url: URL used to resolve relative hrefs.

    Returns:
        Ordered list of unique, normalized absolute URLs.
    """
    soup = BeautifulSoup(html, "lxml")
    # dict preserves insertion order and gives O(1) dedup.
    seen: dict[str, None] = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        if not href or href.startswith(("javascript:", "mailto:")):
            continue
        seen.setdefault(normalize_url(urljoin(base_url, href)), None)
    return list(seen)
# ── 主要功能 ──────────────────────────────────────────────────────────────────
def convert_single(url: str, output_path: str, session: requests.Session) -> bool:
    """Single-page mode: fetch one URL, convert it, write *output_path*.

    Returns True on success, False when the page could not be fetched.
    """
    print(f"📄 抓取:{url}")
    html, title = fetch_page(url, session)
    if not html:
        return False
    markdown = html_to_markdown(html, url, title)
    target_dir = os.path.dirname(output_path) or "."
    os.makedirs(target_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as fh:
        fh.write(markdown)
    line_count = markdown.count("\n")
    print(f" ✅ 已儲存:{output_path}(約 {line_count} 行)")
    return True
def crawl(
    start_url: str,
    output_dir: str,
    max_depth: int = 2,
    same_path: bool = False,
    max_pages: int = 50,
    delay: float = 1.0,
    exclude_patterns: list[str] = None,
    session: requests.Session = None,
) -> dict:
    """Breadth-first crawl from *start_url*, saving each page as Markdown.

    Follows same-domain links (optionally restricted to the start URL's
    path prefix) up to *max_depth* hops and *max_pages* pages, sleeping
    *delay* seconds between requests.  Writes one ``.md`` file per page
    into *output_dir*, plus a ``_crawl_log.md`` log and an ``index.md``.

    Returns:
        ``{url: output_file_path}`` for every page saved.
    """
    if exclude_patterns is None:
        exclude_patterns = DEFAULT_EXCLUDE_PATTERNS
    if session is None:
        # BUGFIX: the old code crashed on session.get when no session was
        # supplied; fall back to a default browser-like session.
        session = requests.Session()
        session.headers.update(DEFAULT_HEADERS)
    queue = deque([(normalize_url(start_url), 0)])  # (url, depth)
    visited = set()
    results = {}
    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, "_crawl_log.md")
    print(f"\n🕷️ 開始爬取:{start_url}")
    print(f" 深度上限:{max_depth},頁面上限:{max_pages},延遲:{delay}s")
    if same_path:
        print(f" 路徑限制:只爬 {urlparse(start_url).path} 底下的頁面")
    print()
    with open(log_path, "w", encoding="utf-8") as log:
        log.write(f"# 爬蟲記錄\n\n")
        log.write(f"- **起始 URL**{start_url}\n")
        log.write(f"- **執行時間**{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        log.write("## 已爬取頁面\n\n")
        while queue and len(results) < max_pages:
            url, depth = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            if should_skip(url, exclude_patterns):
                print(f" ⏭️ 跳過(排除規則):{url}")
                continue
            if not is_same_domain(url, start_url):
                continue
            if same_path and not is_same_path_prefix(url, start_url):
                continue
            # Fetch.
            print(f" {' ' * depth}📄 [深度{depth}] {url}")
            html, title = fetch_page(url, session)
            if not html:
                continue
            # Convert and save.
            filename = url_to_filename(url)
            output_path = os.path.join(output_dir, filename)
            md = html_to_markdown(html, url, title)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(md)
            results[url] = output_path
            lines = md.count("\n")
            # BUGFIX: the progress line and log row previously printed a
            # literal "(unknown)" placeholder instead of the saved filename.
            print(f" {' ' * depth} ✅ → {filename}({lines} 行)")
            log.write(f"| {url} | [{filename}](./{filename}) | {title} |\n")
            # Below the depth limit, keep following links.
            if depth < max_depth:
                links = extract_links(html, url)
                for link in links:
                    if link not in visited and not should_skip(link, exclude_patterns):
                        queue.append((link, depth + 1))
            if queue and delay > 0:
                time.sleep(delay)
        log.write(f"\n---\n共爬取 **{len(results)}** 頁\n")
    print(f"\n✅ 爬取完成:{len(results)} 頁 → {output_dir}")
    print(f"📋 爬取記錄:{log_path}")
    # Write the index file.
    index_path = os.path.join(output_dir, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(f"# 爬取索引:{start_url}\n\n")
        f.write(f"爬取時間:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("## 所有頁面\n\n")
        for url, path in results.items():
            fname = os.path.basename(path)
            f.write(f"- [{url}](./{fname})\n")
    print(f"📑 索引檔案:{index_path}")
    return results
# ── Entry Point ────────────────────────────────────────────────────────────────
def main():
    """CLI entry point: parse arguments, then run single-page or crawl mode."""
    parser = argparse.ArgumentParser(
        description="將網頁轉換為 Markdown單頁或爬蟲模式"
    )
    parser.add_argument("--url", required=True, help="起始 URL")
    parser.add_argument("--crawl", action="store_true", help="啟用爬蟲模式(追蹤同域連結)")
    parser.add_argument("--depth", type=int, default=2, help="爬蟲深度(預設 2")
    parser.add_argument("--same-path", action="store_true", help="只爬相同路徑前綴下的頁面")
    parser.add_argument("--max-pages", type=int, default=50, help="最多爬取頁數(預設 50")
    parser.add_argument("--delay", type=float, default=1.0, help="請求間隔秒數(預設 1.0")
    parser.add_argument("--output", default="output.md", help="單頁模式輸出路徑(預設 output.md")
    parser.add_argument("--output-dir", default="./scraped/", help="爬蟲模式輸出目錄(預設 ./scraped/")
    parser.add_argument("--exclude", action="append", default=[], help="排除含有此字串的 URL可多次使用")
    args = parser.parse_args()

    # Shared HTTP session with browser-like headers.
    session = requests.Session()
    session.headers.update(DEFAULT_HEADERS)

    # Built-in exclusion rules plus any user-supplied ones.
    patterns = DEFAULT_EXCLUDE_PATTERNS + args.exclude

    if not args.crawl:
        # Single-page mode: exit status reflects success.
        ok = convert_single(args.url, args.output, session)
        sys.exit(0 if ok else 1)

    crawl(
        start_url=args.url,
        output_dir=args.output_dir,
        max_depth=args.depth,
        same_path=args.same_path,
        max_pages=args.max_pages,
        delay=args.delay,
        exclude_patterns=patterns,
        session=session,
    )


if __name__ == "__main__":
    main()