#!/usr/bin/env python3
"""
web_to_md.py - Convert web pages to Markdown

Usage:
    # Single-page mode
    python web_to_md.py --url https://example.com/features --output output.md

    # Crawl mode (follow same-domain links)
    python web_to_md.py --url https://example.com --crawl --depth 2 --output-dir ./scraped/

    # Crawl mode (restrict to a path prefix)
    python web_to_md.py --url https://example.com/docs --crawl --depth 3 --same-path --output-dir ./docs/
"""

import argparse
import os
import re
import sys
import time
from collections import deque
from datetime import datetime
from urllib.parse import urljoin, urlparse

try:
    import requests
    from bs4 import BeautifulSoup
    import html2text
except ImportError:
    print("Missing required packages. Run: pip install requests beautifulsoup4 html2text lxml")
    sys.exit(1)

# ── Configuration ─────────────────────────────────────────────────────────────

DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
}

# URL path keywords that are excluded automatically (never crawled)
DEFAULT_EXCLUDE_PATTERNS = [
    "/login", "/logout", "/signup", "/register",
    "/cdn-cgi/", "/__", "/static/", "/assets/",
    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
    ".pdf", ".zip", ".tar", ".gz",
    ".css", ".js", ".woff", ".woff2",
    "/api/", "/feed/", "/rss", "/atom",
    "#",  # anchor links
]

# Noise elements (removed to obtain clean body content)
NOISE_TAGS = [
    "nav", "header", "footer", "aside", "script", "style", "noscript", "iframe",
    "[class*='cookie']", "[class*='banner']", "[class*='popup']",
    "[class*='modal']", "[class*='overlay']", "[id*='cookie']",
    "[class*='nav']", "[class*='footer']", "[class*='header']",
    "[class*='sidebar']", "[class*='advertisement']", "[class*='ad-']",
]

# ── URL helpers ───────────────────────────────────────────────────────────────

def normalize_url(url: str) -> str:
    """Strip the fragment and normalize the URL."""
    parsed = urlparse(url)
    return parsed._replace(fragment="").geturl().rstrip("/")


def is_same_domain(url: str, base_url: str) -> bool:
    return urlparse(url).netloc == urlparse(base_url).netloc


def is_same_path_prefix(url: str, base_url: str) -> bool:
    base_path = urlparse(base_url).path
    url_path = urlparse(url).path
    return url_path.startswith(base_path)


def should_skip(url: str, exclude_patterns: list[str]) -> bool:
    url_lower = url.lower()
    return any(p in url_lower for p in exclude_patterns)


def url_to_filename(url: str) -> str:
    """Turn a URL into a safe output filename."""
    parsed = urlparse(url)
    path = parsed.path.strip("/") or "index"
    path = re.sub(r"[^\w\-/]", "-", path)
    path = path.replace("/", "__")
    if parsed.query:
        query = re.sub(r"[^\w\-]", "-", parsed.query)[:50]
        path = f"{path}_{query}"
    return path[:100] + ".md"

# ── HTML → Markdown ───────────────────────────────────────────────────────────

def clean_html(html_content: str, base_url: str) -> BeautifulSoup:
    """Remove noise elements and keep the main content."""
    soup = BeautifulSoup(html_content, "lxml")

    # Try to locate the main content area
    main_content = (
        soup.find("main")
        or soup.find(attrs={"role": "main"})
        or soup.find("article")
        or soup.find(id=re.compile(r"(content|main|body)", re.I))
        or soup.find(class_=re.compile(r"(content|main|body|post)", re.I))
        or soup.body
        or soup
    )

    # Strip noise
    for tag in NOISE_TAGS:
        if tag.startswith("["):  # CSS selector form
            try:
                for el in main_content.select(tag):
                    el.decompose()
            except Exception:
                pass
        else:
            for el in main_content.find_all(tag):
                el.decompose()

    return main_content


def html_to_markdown(html_content: str, url: str, title: str = "") -> str:
    """Convert HTML into clean Markdown."""
    cleaned = clean_html(html_content, url)

    # Configure html2text
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.ignore_images = False
    converter.ignore_tables = False
    converter.body_width = 0  # no hard wrapping
    converter.protect_links = True
    converter.wrap_links = False
    converter.mark_code = True
    converter.ul_item_mark = "-"
    converter.emphasis_mark = "*"
    converter.strong_mark = "**"
    converter.baseurl = url

    md = converter.handle(str(cleaned))

    # Basic cleanup
    md = re.sub(r"\n{3,}", "\n\n", md)  # collapse extra blank lines
    md = re.sub(r" +\n", "\n", md)      # strip trailing spaces
    md = md.strip()

    # Prepend frontmatter
    crawled_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    frontmatter = f"""---
title: {title or '(untitled)'}
url: {url}
crawled_at: {crawled_at}
---

"""
    return frontmatter + md


def fetch_page(url: str, session: requests.Session, timeout: int = 15) -> tuple[str, str]:
    """
    Fetch a page and return (html_content, page_title).
    Returns ("", "") on failure.
    """
    try:
        resp = session.get(url, timeout=timeout, allow_redirects=True)
        resp.raise_for_status()

        content_type = resp.headers.get("Content-Type", "")
        if "html" not in content_type:
            return "", ""

        resp.encoding = resp.apparent_encoding or "utf-8"
        html = resp.text

        # Extract the title
        soup = BeautifulSoup(html, "lxml")
        title = ""
        if soup.title:
            title = soup.title.string or ""
        if not title and soup.find("h1"):
            title = soup.find("h1").get_text(strip=True)

        return html, title.strip()
    except requests.exceptions.RequestException as e:
        print(f"  ⚠️ Fetch failed {url}: {e}", file=sys.stderr)
        return "", ""


def extract_links(html: str, base_url: str) -> list[str]:
    """Extract all links from the HTML."""
    soup = BeautifulSoup(html, "lxml")
    links = []
    for tag in soup.find_all("a", href=True):
        href = tag["href"].strip()
        if not href or href.startswith("javascript:") or href.startswith("mailto:"):
            continue
        full_url = urljoin(base_url, href)
        links.append(normalize_url(full_url))
    return links

# ── Core features ─────────────────────────────────────────────────────────────

def convert_single(url: str, output_path: str, session: requests.Session) -> bool:
    """Single-page mode: convert one URL."""
    print(f"📄 Fetching: {url}")
    html, title = fetch_page(url, session)
    if not html:
        return False

    md = html_to_markdown(html, url, title)

    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(md)

    lines = md.count("\n")
    print(f"  ✅ Saved: {output_path} (~{lines} lines)")
    return True


def crawl(
    start_url: str,
    output_dir: str,
    max_depth: int = 2,
    same_path: bool = False,
    max_pages: int = 50,
    delay: float = 1.0,
    exclude_patterns: list[str] | None = None,
    session: requests.Session | None = None,
) -> dict:
    """
    Crawl mode: start from start_url and follow same-domain links.
    Returns a mapping of {url: output_file_path}.
    """
    if exclude_patterns is None:
        exclude_patterns = DEFAULT_EXCLUDE_PATTERNS
    if session is None:
        session = requests.Session()
        session.headers.update(DEFAULT_HEADERS)

    queue = deque([(normalize_url(start_url), 0)])  # (url, depth)
    visited = set()
    results = {}

    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, "_crawl_log.md")

    print(f"\n🕷️ Crawling: {start_url}")
    print(f"  Max depth: {max_depth}, page limit: {max_pages}, delay: {delay}s")
    if same_path:
        print(f"  Path restriction: only pages under {urlparse(start_url).path}")
    print()

    with open(log_path, "w", encoding="utf-8") as log:
        log.write("# Crawl Log\n\n")
        log.write(f"- **Start URL**: {start_url}\n")
        log.write(f"- **Run time**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        log.write("## Crawled pages\n\n")
        log.write("| URL | File | Title |\n|---|---|---|\n")

        while queue and len(results) < max_pages:
            url, depth = queue.popleft()

            if url in visited:
                continue
            visited.add(url)

            if should_skip(url, exclude_patterns):
                print(f"  ⏭️ Skipped (exclude rule): {url}")
                continue
            if not is_same_domain(url, start_url):
                continue
            if same_path and not is_same_path_prefix(url, start_url):
                continue

            # Fetch
            print(f"  {' ' * depth}📄 [depth {depth}] {url}")
            html, title = fetch_page(url, session)
            if not html:
                continue

            # Convert and save
            filename = url_to_filename(url)
            output_path = os.path.join(output_dir, filename)
            md = html_to_markdown(html, url, title)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(md)

            results[url] = output_path
            lines = md.count("\n")
            print(f"  {' ' * depth}  ✅ → {filename} ({lines} lines)")
            log.write(f"| {url} | [{filename}](./{filename}) | {title} |\n")

            # Follow links if the maximum depth has not been reached
            if depth < max_depth:
                links = extract_links(html, url)
                for link in links:
                    if link not in visited and not should_skip(link, exclude_patterns):
                        queue.append((link, depth + 1))

            if queue and delay > 0:
                time.sleep(delay)

        log.write(f"\n---\nCrawled **{len(results)}** pages in total\n")

    print(f"\n✅ Crawl finished: {len(results)} pages → {output_dir}")
    print(f"📋 Crawl log: {log_path}")

    # Write the index
    index_path = os.path.join(output_dir, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(f"# Crawl index: {start_url}\n\n")
        f.write(f"Crawled at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("## All pages\n\n")
        for url, path in results.items():
            fname = os.path.basename(path)
            f.write(f"- [{url}](./{fname})\n")

    print(f"📑 Index file: {index_path}")
    return results

# ── Entry Point ───────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(
        description="Convert web pages to Markdown (single-page or crawl mode)"
    )
    parser.add_argument("--url", required=True, help="Start URL")
    parser.add_argument("--crawl", action="store_true", help="Enable crawl mode (follow same-domain links)")
    parser.add_argument("--depth", type=int, default=2, help="Crawl depth (default 2)")
    parser.add_argument("--same-path", action="store_true", help="Only crawl pages under the same path prefix")
    parser.add_argument("--max-pages", type=int, default=50, help="Maximum number of pages to crawl (default 50)")
    parser.add_argument("--delay", type=float, default=1.0, help="Delay between requests in seconds (default 1.0)")
    parser.add_argument("--output", default="output.md", help="Output path for single-page mode (default output.md)")
    parser.add_argument("--output-dir", default="./scraped/", help="Output directory for crawl mode (default ./scraped/)")
    parser.add_argument("--exclude", action="append", default=[], help="Exclude URLs containing this string (repeatable)")
    args = parser.parse_args()

    # Combine exclude rules
    exclude_patterns = DEFAULT_EXCLUDE_PATTERNS + args.exclude

    # Create the session
    session = requests.Session()
    session.headers.update(DEFAULT_HEADERS)

    if args.crawl:
        crawl(
            start_url=args.url,
            output_dir=args.output_dir,
            max_depth=args.depth,
            same_path=args.same_path,
            max_pages=args.max_pages,
            delay=args.delay,
            exclude_patterns=exclude_patterns,
            session=session,
        )
    else:
        success = convert_single(args.url, args.output, session)
        sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
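
# ── Programmatic use (sketch) ─────────────────────────────────────────────────
# A minimal sketch of calling this module from another script instead of the
# CLI. It assumes this file is importable as `web_to_md`; the URLs and output
# paths below are hypothetical examples, not values from this project.
#
#   import requests
#   from web_to_md import DEFAULT_HEADERS, convert_single, crawl
#
#   session = requests.Session()
#   session.headers.update(DEFAULT_HEADERS)
#
#   # Single page → one Markdown file
#   convert_single("https://example.com/features", "features.md", session)
#
#   # Crawl a docs subtree two levels deep, staying under the /docs path
#   pages = crawl(
#       "https://example.com/docs",
#       "./docs/",
#       max_depth=2,
#       same_path=True,
#       session=session,
#   )
#   # `pages` maps each crawled URL to the Markdown file it was written to.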