claude-code/claude/skills/web-to-markdown/scripts/web_to_md.py

380 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
web_to_md.py - 將網頁轉換為 Markdown
使用方式:
# 單頁模式
python web_to_md.py --url https://example.com/features --output output.md
# 爬蟲模式(追蹤同域連結)
python web_to_md.py --url https://example.com --crawl --depth 2 --output-dir ./scraped/
# 爬蟲模式(只爬特定路徑前綴)
python web_to_md.py --url https://example.com/docs --crawl --depth 3 --same-path --output-dir ./docs/
"""
import argparse
import os
import re
import sys
import time
from collections import deque
from datetime import datetime
from urllib.parse import urljoin, urlparse
try:
import requests
from bs4 import BeautifulSoup
import html2text
except ImportError:
print("缺少必要套件請執行pip install requests beautifulsoup4 html2text lxml")
sys.exit(1)
# ── Configuration ─────────────────────────────────────────────────────────────
# Browser-like default request headers (helps avoid trivial bot blocking;
# Accept-Language prefers Traditional Chinese, then English).
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
}
# URL substrings that are automatically excluded while crawling
# (auth pages, CDN/static assets, binary downloads, feeds, anchors).
DEFAULT_EXCLUDE_PATTERNS = [
    "/login", "/logout", "/signup", "/register",
    "/cdn-cgi/", "/__", "/static/", "/assets/",
    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
    ".pdf", ".zip", ".tar", ".gz",
    ".css", ".js", ".woff", ".woff2",
    "/api/", "/feed/", "/rss", "/atom",
    "#",  # anchor links
]
# Noise elements removed to get clean body text. Plain tag names are
# removed via find_all(); bracketed entries are CSS attribute selectors
# handled via .select() (see clean_html).
NOISE_TAGS = [
    "nav", "header", "footer", "aside",
    "script", "style", "noscript", "iframe",
    "[class*='cookie']", "[class*='banner']", "[class*='popup']",
    "[class*='modal']", "[class*='overlay']", "[id*='cookie']",
    "[class*='nav']", "[class*='footer']", "[class*='header']",
    "[class*='sidebar']", "[class*='advertisement']", "[class*='ad-']",
]
# ── URL helpers ───────────────────────────────────────────────────────────────
def normalize_url(url: str) -> str:
    """Canonicalize a URL: drop the fragment and any trailing slash."""
    without_fragment = urlparse(url)._replace(fragment="").geturl()
    return without_fragment.rstrip("/")
def is_same_domain(url: str, base_url: str) -> bool:
    """Return True when both URLs share the same network location (host[:port])."""
    left = urlparse(url).netloc
    right = urlparse(base_url).netloc
    return left == right
def is_same_path_prefix(url: str, base_url: str) -> bool:
    """Return True when url's path lies under base_url's path prefix.

    Bug fix: the original used a raw startswith() check, so a base path of
    /docs also matched unrelated siblings like /docs-old. Matching is now
    done on whole path segments: the path must be exactly the base path, or
    start with it followed by a '/'.
    """
    base_path = urlparse(base_url).path
    url_path = urlparse(url).path
    # Exact match, or a descendant separated by a path-segment boundary.
    return url_path == base_path or url_path.startswith(base_path.rstrip("/") + "/")
def should_skip(url: str, exclude_patterns: list[str]) -> bool:
    """Return True when any exclusion pattern occurs in the lowercased URL."""
    lowered = url.lower()
    for pattern in exclude_patterns:
        if pattern in lowered:
            return True
    return False
def url_to_filename(url: str) -> str:
    """Derive a filesystem-safe .md filename from a URL's path and query.

    Unsafe characters become '-', path separators become '__', and the
    stem is capped at 100 characters (query portion at 50).
    """
    parts = urlparse(url)
    stem = parts.path.strip("/")
    if not stem:
        stem = "index"
    stem = re.sub(r"[^\w\-/]", "-", stem).replace("/", "__")
    if parts.query:
        safe_query = re.sub(r"[^\w\-]", "-", parts.query)[:50]
        stem = f"{stem}_{safe_query}"
    return stem[:100] + ".md"
# ── HTML → Markdown ───────────────────────────────────────────────────────────
def clean_html(html_content: str, base_url: str) -> BeautifulSoup:
    """Parse HTML and return the main-content subtree with noise removed.

    Candidate containers are tried in priority order: semantic landmarks
    (<main>, role="main", <article>), then id/class hints, then <body>,
    then the whole document. base_url is accepted for interface
    compatibility but is not used by this function.
    """
    soup = BeautifulSoup(html_content, "lxml")
    candidates = (
        soup.find("main"),
        soup.find(attrs={"role": "main"}),
        soup.find("article"),
        soup.find(id=re.compile(r"(content|main|body)", re.I)),
        soup.find(class_=re.compile(r"(content|main|body|post)", re.I)),
        soup.body,
    )
    main_content = next((node for node in candidates if node is not None), soup)
    # Strip noise: bracketed entries are CSS attribute selectors, plain
    # entries are tag names.
    for selector in NOISE_TAGS:
        if selector.startswith("["):
            # Best-effort: tolerate selectors the parser cannot handle.
            try:
                matches = main_content.select(selector)
            except Exception:
                continue
            for node in matches:
                node.decompose()
        else:
            for node in main_content.find_all(selector):
                node.decompose()
    return main_content
def html_to_markdown(html_content: str, url: str, title: str = "") -> str:
    """Convert raw HTML into Markdown prefixed with a YAML frontmatter block.

    The HTML is first reduced to its main content via clean_html(), then
    converted with html2text, lightly post-processed, and topped with a
    frontmatter header carrying title, url, and crawl timestamp.
    """
    cleaned = clean_html(html_content, url)

    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.ignore_images = False
    converter.ignore_tables = False
    converter.body_width = 0  # disable hard line wrapping
    converter.protect_links = True
    converter.wrap_links = False
    converter.mark_code = True
    converter.ul_item_mark = "-"
    converter.emphasis_mark = "*"
    converter.strong_mark = "**"
    converter.baseurl = url

    markdown = converter.handle(str(cleaned))
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)  # collapse runs of blank lines
    markdown = re.sub(r" +\n", "\n", markdown)      # drop trailing spaces
    markdown = markdown.strip()

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    frontmatter = (
        "---\n"
        f"title: {title or '(無標題)'}\n"
        f"url: {url}\n"
        f"crawled_at: {stamp}\n"
        "---\n"
    )
    return frontmatter + markdown
def fetch_page(url: str, session: requests.Session, timeout: int = 15) -> tuple[str, str]:
    """Download one page and return (html_content, page_title).

    Returns ("", "") on any request failure or when the Content-Type does
    not look like HTML. The title falls back to the first <h1> when the
    <title> tag is missing or empty.
    """
    try:
        response = session.get(url, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        if "html" not in response.headers.get("Content-Type", ""):
            return "", ""
        # Let requests sniff the encoding; default to UTF-8 if it cannot.
        response.encoding = response.apparent_encoding or "utf-8"
        html = response.text
        soup = BeautifulSoup(html, "lxml")
        title = ""
        if soup.title:
            title = soup.title.string or ""
        if not title:
            heading = soup.find("h1")
            if heading:
                title = heading.get_text(strip=True)
        return html, title.strip()
    except requests.exceptions.RequestException as e:
        print(f" ⚠️ 抓取失敗 {url}: {e}", file=sys.stderr)
        return "", ""
def extract_links(html: str, base_url: str) -> list[str]:
    """Collect normalized absolute URLs from every <a href> in the page.

    javascript: and mailto: pseudo-links are skipped; relative links are
    resolved against base_url before normalization.
    """
    soup = BeautifulSoup(html, "lxml")
    found = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        if not href or href.startswith(("javascript:", "mailto:")):
            continue
        found.append(normalize_url(urljoin(base_url, href)))
    return found
# ── Main features ─────────────────────────────────────────────────────────────
def convert_single(url: str, output_path: str, session: requests.Session) -> bool:
    """Single-page mode: fetch one URL, convert it, and save the Markdown.

    Returns True on success, False when the page could not be fetched.
    """
    print(f"📄 抓取:{url}")
    html, title = fetch_page(url, session)
    if not html:
        return False
    markdown = html_to_markdown(html, url, title)
    target_dir = os.path.dirname(output_path) or "."
    os.makedirs(target_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(markdown)
    line_count = markdown.count("\n")
    print(f" ✅ 已儲存:{output_path}(約 {line_count} 行)")
    return True
def crawl(
    start_url: str,
    output_dir: str,
    max_depth: int = 2,
    same_path: bool = False,
    max_pages: int = 50,
    delay: float = 1.0,
    exclude_patterns: list[str] = None,
    session: requests.Session = None,
) -> dict:
    """Crawl mode: BFS from start_url following same-domain links.

    Each fetched page is converted to Markdown and saved under output_dir;
    a _crawl_log.md table and an index.md listing are written alongside.

    Args:
        start_url: URL to start crawling from.
        output_dir: directory for the generated .md files (created if missing).
        max_depth: maximum link-following depth (start page is depth 0).
        same_path: if True, only crawl pages under start_url's path prefix.
        max_pages: stop after this many pages have been saved.
        delay: seconds to sleep between requests (politeness).
        exclude_patterns: URL substrings to skip; defaults to
            DEFAULT_EXCLUDE_PATTERNS.
        session: requests.Session to reuse; a default one is built when None.

    Returns:
        A {url: output_file_path} mapping of the saved pages.
    """
    if exclude_patterns is None:
        exclude_patterns = DEFAULT_EXCLUDE_PATTERNS
    if session is None:
        # Bug fix: session previously defaulted to None and crashed inside
        # fetch_page(); build a default session so crawl() works standalone.
        session = requests.Session()
        session.headers.update(DEFAULT_HEADERS)

    queue = deque([(normalize_url(start_url), 0)])  # (url, depth)
    visited = set()
    results = {}
    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, "_crawl_log.md")

    print(f"\n🕷️ 開始爬取:{start_url}")
    print(f" 深度上限:{max_depth},頁面上限:{max_pages},延遲:{delay}s")
    if same_path:
        print(f" 路徑限制:只爬 {urlparse(start_url).path} 底下的頁面")
    print()

    with open(log_path, "w", encoding="utf-8") as log:
        log.write("# 爬蟲記錄\n\n")
        log.write(f"- **起始 URL**:{start_url}\n")
        log.write(f"- **執行時間**:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        log.write("## 已爬取頁面\n\n")
        # Fix: the per-page rows below are Markdown table rows, so emit a
        # table header first to make the log render correctly.
        log.write("| URL | 檔案 | 標題 |\n")
        log.write("| --- | --- | --- |\n")

        while queue and len(results) < max_pages:
            url, depth = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            if should_skip(url, exclude_patterns):
                print(f" ⏭️ 跳過(排除規則):{url}")
                continue
            if not is_same_domain(url, start_url):
                continue
            if same_path and not is_same_path_prefix(url, start_url):
                continue

            # Fetch the page.
            print(f" {' ' * depth}📄 [深度{depth}] {url}")
            html, title = fetch_page(url, session)
            if not html:
                continue

            # Convert and save.
            filename = url_to_filename(url)
            output_path = os.path.join(output_dir, filename)
            md = html_to_markdown(html, url, title)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(md)
            results[url] = output_path
            lines = md.count("\n")
            # Fix: restore the {filename} placeholders that were garbled in
            # the original print/log format strings.
            print(f" {' ' * depth} ✅ → {filename}(約 {lines} 行)")
            log.write(f"| {url} | [{filename}](./{filename}) | {title} |\n")

            # Keep following links while under the depth limit.
            if depth < max_depth:
                for link in extract_links(html, url):
                    if link not in visited and not should_skip(link, exclude_patterns):
                        queue.append((link, depth + 1))
            # Politeness delay, skipped after the final page.
            if queue and delay > 0:
                time.sleep(delay)

        log.write(f"\n---\n共爬取 **{len(results)}** 頁\n")

    print(f"\n✅ 爬取完成:{len(results)} 頁 → {output_dir}")
    print(f"📋 爬取記錄:{log_path}")

    # Write an index of everything that was saved.
    index_path = os.path.join(output_dir, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(f"# 爬取索引:{start_url}\n\n")
        f.write(f"爬取時間:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("## 所有頁面\n\n")
        for url, path in results.items():
            fname = os.path.basename(path)
            f.write(f"- [{url}](./{fname})\n")
    print(f"📑 索引檔案:{index_path}")
    return results
# ── Entry Point ────────────────────────────────────────────────────────────────
def main():
    """CLI entry point: parse arguments and dispatch to single-page or crawl mode.

    Exits with status 1 when single-page conversion fails. Fix: several
    help strings were missing their closing parentheses (extraction
    damage); they are restored here.
    """
    parser = argparse.ArgumentParser(
        description="將網頁轉換為 Markdown(單頁或爬蟲模式)"
    )
    parser.add_argument("--url", required=True, help="起始 URL")
    parser.add_argument("--crawl", action="store_true", help="啟用爬蟲模式(追蹤同域連結)")
    parser.add_argument("--depth", type=int, default=2, help="爬蟲深度(預設 2)")
    parser.add_argument("--same-path", action="store_true", help="只爬相同路徑前綴下的頁面")
    parser.add_argument("--max-pages", type=int, default=50, help="最多爬取頁數(預設 50)")
    parser.add_argument("--delay", type=float, default=1.0, help="請求間隔秒數(預設 1.0)")
    parser.add_argument("--output", default="output.md", help="單頁模式輸出路徑(預設 output.md)")
    parser.add_argument("--output-dir", default="./scraped/", help="爬蟲模式輸出目錄(預設 ./scraped/)")
    parser.add_argument("--exclude", action="append", default=[], help="排除含有此字串的 URL(可多次使用)")
    args = parser.parse_args()

    # Merge the built-in exclusion patterns with any user-supplied ones.
    exclude_patterns = DEFAULT_EXCLUDE_PATTERNS + args.exclude

    # One shared session so default headers and connection pooling apply
    # to every request.
    session = requests.Session()
    session.headers.update(DEFAULT_HEADERS)

    if args.crawl:
        crawl(
            start_url=args.url,
            output_dir=args.output_dir,
            max_depth=args.depth,
            same_path=args.same_path,
            max_pages=args.max_pages,
            delay=args.delay,
            exclude_patterns=exclude_patterns,
            session=session,
        )
    else:
        success = convert_single(args.url, args.output, session)
        sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()