claude-code/claude/skills/web-to-markdown/scripts/web_to_md.py

380 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
web_to_md.py - 將網頁轉換為 Markdown
使用方式:
# 單頁模式
python web_to_md.py --url https://example.com/features --output output.md
# 爬蟲模式(追蹤同域連結)
python web_to_md.py --url https://example.com --crawl --depth 2 --output-dir ./scraped/
# 爬蟲模式(只爬特定路徑前綴)
python web_to_md.py --url https://example.com/docs --crawl --depth 3 --same-path --output-dir ./docs/
"""
import argparse
import os
import re
import sys
import time
from collections import deque
from datetime import datetime
from urllib.parse import urljoin, urlparse
try:
import requests
from bs4 import BeautifulSoup
import html2text
except ImportError:
print("缺少必要套件請執行pip install requests beautifulsoup4 html2text lxml")
sys.exit(1)
# ── Configuration ─────────────────────────────────────────────────────────────
# Browser-like default request headers (helps avoid trivial bot blocking;
# Accept-Language prefers Traditional Chinese, then English).
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
}
# URL substrings that are automatically excluded while crawling
# (auth pages, CDN/static assets, binary downloads, feeds, anchors).
DEFAULT_EXCLUDE_PATTERNS = [
    "/login", "/logout", "/signup", "/register",
    "/cdn-cgi/", "/__", "/static/", "/assets/",
    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
    ".pdf", ".zip", ".tar", ".gz",
    ".css", ".js", ".woff", ".woff2",
    "/api/", "/feed/", "/rss", "/atom",
    "#",  # anchor links
]
# Noise elements removed to get clean body text. Plain tag names are
# removed via find_all(); bracketed entries are CSS attribute selectors
# handled via .select() (see clean_html).
NOISE_TAGS = [
    "nav", "header", "footer", "aside",
    "script", "style", "noscript", "iframe",
    "[class*='cookie']", "[class*='banner']", "[class*='popup']",
    "[class*='modal']", "[class*='overlay']", "[id*='cookie']",
    "[class*='nav']", "[class*='footer']", "[class*='header']",
    "[class*='sidebar']", "[class*='advertisement']", "[class*='ad-']",
]
# ── URL helpers ───────────────────────────────────────────────────────────────
def normalize_url(url: str) -> str:
    """Canonicalize a URL: drop the fragment and any trailing slash."""
    without_fragment = urlparse(url)._replace(fragment="").geturl()
    return without_fragment.rstrip("/")
def is_same_domain(url: str, base_url: str) -> bool:
    """Return True when both URLs share the same network location (host[:port])."""
    left = urlparse(url).netloc
    right = urlparse(base_url).netloc
    return left == right
def is_same_path_prefix(url: str, base_url: str) -> bool:
    """Return True when url's path lies under base_url's path prefix.

    Bug fix: the original used a raw startswith() check, so a base path of
    /docs also matched unrelated siblings like /docs-old. Matching is now
    done on whole path segments: the path must be exactly the base path, or
    start with it followed by a '/'.
    """
    base_path = urlparse(base_url).path
    url_path = urlparse(url).path
    # Exact match, or a descendant separated by a path-segment boundary.
    return url_path == base_path or url_path.startswith(base_path.rstrip("/") + "/")
def should_skip(url: str, exclude_patterns: list[str]) -> bool:
    """Return True when any exclusion pattern occurs in the lowercased URL."""
    lowered = url.lower()
    for pattern in exclude_patterns:
        if pattern in lowered:
            return True
    return False
def url_to_filename(url: str) -> str:
    """Derive a filesystem-safe .md filename from a URL's path and query.

    Unsafe characters become '-', path separators become '__', and the
    stem is capped at 100 characters (query portion at 50).
    """
    parts = urlparse(url)
    stem = parts.path.strip("/")
    if not stem:
        stem = "index"
    stem = re.sub(r"[^\w\-/]", "-", stem).replace("/", "__")
    if parts.query:
        safe_query = re.sub(r"[^\w\-]", "-", parts.query)[:50]
        stem = f"{stem}_{safe_query}"
    return stem[:100] + ".md"
# ── HTML → Markdown ───────────────────────────────────────────────────────────
def clean_html(html_content: str, base_url: str) -> BeautifulSoup:
    """Parse HTML and return the main-content subtree with noise removed.

    Candidate containers are tried in priority order: semantic landmarks
    (<main>, role="main", <article>), then id/class hints, then <body>,
    then the whole document. base_url is accepted for interface
    compatibility but is not used by this function.
    """
    soup = BeautifulSoup(html_content, "lxml")
    candidates = (
        soup.find("main"),
        soup.find(attrs={"role": "main"}),
        soup.find("article"),
        soup.find(id=re.compile(r"(content|main|body)", re.I)),
        soup.find(class_=re.compile(r"(content|main|body|post)", re.I)),
        soup.body,
    )
    main_content = next((node for node in candidates if node is not None), soup)
    # Strip noise: bracketed entries are CSS attribute selectors, plain
    # entries are tag names.
    for selector in NOISE_TAGS:
        if selector.startswith("["):
            # Best-effort: tolerate selectors the parser cannot handle.
            try:
                matches = main_content.select(selector)
            except Exception:
                continue
            for node in matches:
                node.decompose()
        else:
            for node in main_content.find_all(selector):
                node.decompose()
    return main_content
def html_to_markdown(html_content: str, url: str, title: str = "") -> str:
    """Convert raw HTML into Markdown prefixed with a YAML frontmatter block.

    The HTML is first reduced to its main content via clean_html(), then
    converted with html2text, lightly post-processed, and topped with a
    frontmatter header carrying title, url, and crawl timestamp.
    """
    cleaned = clean_html(html_content, url)

    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.ignore_images = False
    converter.ignore_tables = False
    converter.body_width = 0  # disable hard line wrapping
    converter.protect_links = True
    converter.wrap_links = False
    converter.mark_code = True
    converter.ul_item_mark = "-"
    converter.emphasis_mark = "*"
    converter.strong_mark = "**"
    converter.baseurl = url

    markdown = converter.handle(str(cleaned))
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)  # collapse runs of blank lines
    markdown = re.sub(r" +\n", "\n", markdown)      # drop trailing spaces
    markdown = markdown.strip()

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    frontmatter = (
        "---\n"
        f"title: {title or '(無標題)'}\n"
        f"url: {url}\n"
        f"crawled_at: {stamp}\n"
        "---\n"
    )
    return frontmatter + markdown
def fetch_page(url: str, session: requests.Session, timeout: int = 15) -> tuple[str, str]:
    """Download one page and return (html_content, page_title).

    Returns ("", "") on any request failure or when the Content-Type does
    not look like HTML. The title falls back to the first <h1> when the
    <title> tag is missing or empty.
    """
    try:
        response = session.get(url, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        if "html" not in response.headers.get("Content-Type", ""):
            return "", ""
        # Let requests sniff the encoding; default to UTF-8 if it cannot.
        response.encoding = response.apparent_encoding or "utf-8"
        html = response.text
        soup = BeautifulSoup(html, "lxml")
        title = ""
        if soup.title:
            title = soup.title.string or ""
        if not title:
            heading = soup.find("h1")
            if heading:
                title = heading.get_text(strip=True)
        return html, title.strip()
    except requests.exceptions.RequestException as e:
        print(f" ⚠️ 抓取失敗 {url}: {e}", file=sys.stderr)
        return "", ""
def extract_links(html: str, base_url: str) -> list[str]:
    """Collect normalized absolute URLs from every <a href> in the page.

    javascript: and mailto: pseudo-links are skipped; relative links are
    resolved against base_url before normalization.
    """
    soup = BeautifulSoup(html, "lxml")
    found = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        if not href or href.startswith(("javascript:", "mailto:")):
            continue
        found.append(normalize_url(urljoin(base_url, href)))
    return found
# ── Main features ─────────────────────────────────────────────────────────────
def convert_single(url: str, output_path: str, session: requests.Session) -> bool:
    """Single-page mode: fetch one URL, convert it, and save the Markdown.

    Returns True on success, False when the page could not be fetched.
    """
    print(f"📄 抓取:{url}")
    html, title = fetch_page(url, session)
    if not html:
        return False
    markdown = html_to_markdown(html, url, title)
    target_dir = os.path.dirname(output_path) or "."
    os.makedirs(target_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(markdown)
    line_count = markdown.count("\n")
    print(f" ✅ 已儲存:{output_path}(約 {line_count} 行)")
    return True
def crawl(
    start_url: str,
    output_dir: str,
    max_depth: int = 2,
    same_path: bool = False,
    max_pages: int = 50,
    delay: float = 1.0,
    exclude_patterns: list[str] = None,
    session: requests.Session = None,
) -> dict:
    """Crawl mode: BFS from start_url following same-domain links.

    Each fetched page is converted to Markdown and saved under output_dir;
    a _crawl_log.md table and an index.md listing are written alongside.

    Args:
        start_url: URL to start crawling from.
        output_dir: directory for the generated .md files (created if missing).
        max_depth: maximum link-following depth (start page is depth 0).
        same_path: if True, only crawl pages under start_url's path prefix.
        max_pages: stop after this many pages have been saved.
        delay: seconds to sleep between requests (politeness).
        exclude_patterns: URL substrings to skip; defaults to
            DEFAULT_EXCLUDE_PATTERNS.
        session: requests.Session to reuse; a default one is built when None.

    Returns:
        A {url: output_file_path} mapping of the saved pages.
    """
    if exclude_patterns is None:
        exclude_patterns = DEFAULT_EXCLUDE_PATTERNS
    if session is None:
        # Bug fix: session previously defaulted to None and crashed inside
        # fetch_page(); build a default session so crawl() works standalone.
        session = requests.Session()
        session.headers.update(DEFAULT_HEADERS)

    queue = deque([(normalize_url(start_url), 0)])  # (url, depth)
    visited = set()
    results = {}
    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, "_crawl_log.md")

    print(f"\n🕷️ 開始爬取:{start_url}")
    print(f" 深度上限:{max_depth},頁面上限:{max_pages},延遲:{delay}s")
    if same_path:
        print(f" 路徑限制:只爬 {urlparse(start_url).path} 底下的頁面")
    print()

    with open(log_path, "w", encoding="utf-8") as log:
        log.write("# 爬蟲記錄\n\n")
        log.write(f"- **起始 URL**:{start_url}\n")
        log.write(f"- **執行時間**:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        log.write("## 已爬取頁面\n\n")
        # Fix: the per-page rows below are Markdown table rows, so emit a
        # table header first to make the log render correctly.
        log.write("| URL | 檔案 | 標題 |\n")
        log.write("| --- | --- | --- |\n")

        while queue and len(results) < max_pages:
            url, depth = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            if should_skip(url, exclude_patterns):
                print(f" ⏭️ 跳過(排除規則):{url}")
                continue
            if not is_same_domain(url, start_url):
                continue
            if same_path and not is_same_path_prefix(url, start_url):
                continue

            # Fetch the page.
            print(f" {' ' * depth}📄 [深度{depth}] {url}")
            html, title = fetch_page(url, session)
            if not html:
                continue

            # Convert and save.
            filename = url_to_filename(url)
            output_path = os.path.join(output_dir, filename)
            md = html_to_markdown(html, url, title)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(md)
            results[url] = output_path
            lines = md.count("\n")
            # Fix: restore the {filename} placeholders that were garbled in
            # the original print/log format strings.
            print(f" {' ' * depth} ✅ → {filename}(約 {lines} 行)")
            log.write(f"| {url} | [{filename}](./{filename}) | {title} |\n")

            # Keep following links while under the depth limit.
            if depth < max_depth:
                for link in extract_links(html, url):
                    if link not in visited and not should_skip(link, exclude_patterns):
                        queue.append((link, depth + 1))
            # Politeness delay, skipped after the final page.
            if queue and delay > 0:
                time.sleep(delay)

        log.write(f"\n---\n共爬取 **{len(results)}** 頁\n")

    print(f"\n✅ 爬取完成:{len(results)} 頁 → {output_dir}")
    print(f"📋 爬取記錄:{log_path}")

    # Write an index of everything that was saved.
    index_path = os.path.join(output_dir, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(f"# 爬取索引:{start_url}\n\n")
        f.write(f"爬取時間:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("## 所有頁面\n\n")
        for url, path in results.items():
            fname = os.path.basename(path)
            f.write(f"- [{url}](./{fname})\n")
    print(f"📑 索引檔案:{index_path}")
    return results
# ── Entry Point ────────────────────────────────────────────────────────────────
def main():
    """CLI entry point: parse arguments and dispatch to single-page or crawl mode.

    Exits with status 1 when single-page conversion fails. Fix: several
    help strings were missing their closing parentheses (extraction
    damage); they are restored here.
    """
    parser = argparse.ArgumentParser(
        description="將網頁轉換為 Markdown(單頁或爬蟲模式)"
    )
    parser.add_argument("--url", required=True, help="起始 URL")
    parser.add_argument("--crawl", action="store_true", help="啟用爬蟲模式(追蹤同域連結)")
    parser.add_argument("--depth", type=int, default=2, help="爬蟲深度(預設 2)")
    parser.add_argument("--same-path", action="store_true", help="只爬相同路徑前綴下的頁面")
    parser.add_argument("--max-pages", type=int, default=50, help="最多爬取頁數(預設 50)")
    parser.add_argument("--delay", type=float, default=1.0, help="請求間隔秒數(預設 1.0)")
    parser.add_argument("--output", default="output.md", help="單頁模式輸出路徑(預設 output.md)")
    parser.add_argument("--output-dir", default="./scraped/", help="爬蟲模式輸出目錄(預設 ./scraped/)")
    parser.add_argument("--exclude", action="append", default=[], help="排除含有此字串的 URL(可多次使用)")
    args = parser.parse_args()

    # Merge the built-in exclusion patterns with any user-supplied ones.
    exclude_patterns = DEFAULT_EXCLUDE_PATTERNS + args.exclude

    # One shared session so default headers and connection pooling apply
    # to every request.
    session = requests.Session()
    session.headers.update(DEFAULT_HEADERS)

    if args.crawl:
        crawl(
            start_url=args.url,
            output_dir=args.output_dir,
            max_depth=args.depth,
            same_path=args.same_path,
            max_pages=args.max_pages,
            delay=args.delay,
            exclude_patterns=exclude_patterns,
            session=session,
        )
    else:
        success = convert_single(args.url, args.output, session)
        sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()