#!/usr/bin/env python3
"""
web_to_md.py - 將網頁轉換為 Markdown

使用方式:
    # 單頁模式
    python web_to_md.py --url https://example.com/features --output output.md

    # 爬蟲模式(追蹤同域連結)
    python web_to_md.py --url https://example.com --crawl --depth 2 --output-dir ./scraped/

    # 爬蟲模式(只爬特定路徑前綴)
    python web_to_md.py --url https://example.com/docs --crawl --depth 3 --same-path --output-dir ./docs/
"""
import argparse
import os
import re
import sys
import time
from collections import deque
from datetime import datetime
from urllib.parse import urljoin, urlparse

# Third-party dependencies; exit early with an install hint when any is missing.
try:
    import requests
    from bs4 import BeautifulSoup
    import html2text
except ImportError:
    print("缺少必要套件,請執行:pip install requests beautifulsoup4 html2text lxml")
    sys.exit(1)
# ── Settings ──────────────────────────────────────────────────────────────────

# Browser-like request headers (some sites reject the default requests UA).
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
}

# URL substrings that are automatically excluded from crawling
# (matched case-insensitively by should_skip).
DEFAULT_EXCLUDE_PATTERNS = [
    "/login", "/logout", "/signup", "/register",
    "/cdn-cgi/", "/__", "/static/", "/assets/",
    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
    ".pdf", ".zip", ".tar", ".gz",
    ".css", ".js", ".woff", ".woff2",
    "/api/", "/feed/", "/rss", "/atom",
    "#",  # anchor links
]

# Noise elements removed to obtain clean body text.
# Plain entries are tag names; bracketed entries are CSS attribute selectors.
NOISE_TAGS = [
    "nav", "header", "footer", "aside",
    "script", "style", "noscript", "iframe",
    "[class*='cookie']", "[class*='banner']", "[class*='popup']",
    "[class*='modal']", "[class*='overlay']", "[id*='cookie']",
    "[class*='nav']", "[class*='footer']", "[class*='header']",
    "[class*='sidebar']", "[class*='advertisement']", "[class*='ad-']",
]
# ── URL utilities ─────────────────────────────────────────────────────────────

def normalize_url(url: str) -> str:
    """Canonicalize *url*: drop the fragment and any trailing slash."""
    without_fragment = urlparse(url)._replace(fragment="")
    return without_fragment.geturl().rstrip("/")
def is_same_domain(url: str, base_url: str) -> bool:
    """Return True when both URLs share the same network location (host:port)."""
    left = urlparse(url).netloc
    right = urlparse(base_url).netloc
    return left == right
def is_same_path_prefix(url: str, base_url: str) -> bool:
    """Return True when *url*'s path starts with *base_url*'s path prefix."""
    prefix = urlparse(base_url).path
    return urlparse(url).path.startswith(prefix)
def should_skip(url: str, exclude_patterns: list[str]) -> bool:
    """Return True when any exclusion pattern occurs in the lowercased URL."""
    lowered = url.lower()
    for pattern in exclude_patterns:
        if pattern in lowered:
            return True
    return False
def url_to_filename(url: str) -> str:
    """Derive a filesystem-safe ``.md`` filename from *url*.

    Path separators become ``__``, other unsafe characters become ``-``,
    the query string (if any) is appended, and the stem is capped at 100 chars.
    """
    parts = urlparse(url)
    stem = parts.path.strip("/") or "index"
    stem = re.sub(r"[^\w\-/]", "-", stem).replace("/", "__")
    if parts.query:
        safe_query = re.sub(r"[^\w\-]", "-", parts.query)[:50]
        stem = f"{stem}_{safe_query}"
    return stem[:100] + ".md"
# ── HTML → Markdown ───────────────────────────────────────────────────────────

def clean_html(html_content: str, base_url: str) -> BeautifulSoup:
    """Parse *html_content* and return the main content region, noise removed.

    Tries a series of heuristics in order (``<main>``, role=main, ``<article>``,
    content-ish id/class, ``<body>``) and falls back to the whole document.
    *base_url* is accepted for interface compatibility; it is not used here.
    """
    soup = BeautifulSoup(html_content, "lxml")

    # Most-specific candidate wins; `soup` itself is the last resort.
    content = (
        soup.find("main")
        or soup.find(attrs={"role": "main"})
        or soup.find("article")
        or soup.find(id=re.compile(r"(content|main|body)", re.I))
        or soup.find(class_=re.compile(r"(content|main|body|post)", re.I))
        or soup.body
        or soup
    )

    # Strip boilerplate elements in place.
    for selector in NOISE_TAGS:
        if selector.startswith("["):
            # Bracketed entries are CSS attribute selectors; tolerate bad ones.
            try:
                removable = content.select(selector)
            except Exception:
                continue
        else:
            removable = content.find_all(selector)
        for node in removable:
            node.decompose()

    return content
def html_to_markdown(html_content: str, url: str, title: str = "") -> str:
    """Convert an HTML page to Markdown prefixed with a frontmatter header.

    The page is first reduced to its main content via ``clean_html``, then
    converted with html2text, lightly tidied, and returned with a
    ``title`` / ``url`` / ``crawled_at`` frontmatter block on top.
    """
    main_content = clean_html(html_content, url)

    # Configure html2text for clean, unwrapped Markdown output.
    h2t = html2text.HTML2Text()
    h2t.ignore_links = False
    h2t.ignore_images = False
    h2t.ignore_tables = False
    h2t.body_width = 0  # no hard line wrapping
    h2t.protect_links = True
    h2t.wrap_links = False
    h2t.mark_code = True
    h2t.ul_item_mark = "-"
    h2t.emphasis_mark = "*"
    h2t.strong_mark = "**"
    h2t.baseurl = url  # resolve relative links against the page URL

    markdown = h2t.handle(str(main_content))

    # Tidy: collapse runs of blank lines, drop trailing spaces, trim ends.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)
    markdown = re.sub(r" +\n", "\n", markdown)
    markdown = markdown.strip()

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    header = (
        "---\n"
        f"title: {title or '(無標題)'}\n"
        f"url: {url}\n"
        f"crawled_at: {timestamp}\n"
        "---\n"
        "\n"
    )
    return header + markdown
def fetch_page(url: str, session: requests.Session, timeout: int = 15) -> tuple[str, str]:
    """Fetch *url* and return ``(html_content, page_title)``.

    Returns ``("", "")`` on any request failure or for non-HTML responses.
    The title comes from ``<title>``, falling back to the first ``<h1>``.
    """
    try:
        response = session.get(url, timeout=timeout, allow_redirects=True)
        response.raise_for_status()

        # Skip anything that is not an HTML document.
        if "html" not in response.headers.get("Content-Type", ""):
            return "", ""

        # Trust the detected encoding over the declared one; default to UTF-8.
        response.encoding = response.apparent_encoding or "utf-8"
        html = response.text

        # Extract a title: <title> first, then the first <h1>.
        soup = BeautifulSoup(html, "lxml")
        title = ""
        if soup.title:
            title = soup.title.string or ""
        if not title:
            h1 = soup.find("h1")
            if h1:
                title = h1.get_text(strip=True)

        return html, title.strip()

    except requests.exceptions.RequestException as e:
        print(f" ⚠️ 抓取失敗 {url}: {e}", file=sys.stderr)
        return "", ""
def extract_links(html: str, base_url: str) -> list[str]:
    """Collect absolute, normalized URLs from every ``<a href>`` in *html*."""
    soup = BeautifulSoup(html, "lxml")
    collected: list[str] = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        # Drop empty pseudo-links and javascript:/mailto: schemes.
        if not href or href.startswith(("javascript:", "mailto:")):
            continue
        collected.append(normalize_url(urljoin(base_url, href)))
    return collected
# ── Core features ─────────────────────────────────────────────────────────────

def convert_single(url: str, output_path: str, session: requests.Session) -> bool:
    """Single-page mode: fetch one URL and save it as a Markdown file.

    Returns True on success, False when the page could not be fetched.
    """
    print(f"📄 抓取:{url}")
    html, title = fetch_page(url, session)
    if not html:
        return False

    markdown = html_to_markdown(html, url, title)

    # Ensure the destination directory exists before writing.
    parent = os.path.dirname(output_path) or "."
    os.makedirs(parent, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as fh:
        fh.write(markdown)

    line_count = markdown.count("\n")
    print(f" ✅ 已儲存:{output_path}(約 {line_count} 行)")
    return True
def crawl(
    start_url: str,
    output_dir: str,
    max_depth: int = 2,
    same_path: bool = False,
    max_pages: int = 50,
    delay: float = 1.0,
    exclude_patterns: list[str] = None,
    session: requests.Session = None,
) -> dict:
    """
    Crawler mode: start from *start_url* and follow same-domain links.

    Each saved page becomes a Markdown file in *output_dir*; a crawl log
    (``_crawl_log.md``) and an index (``index.md``) are written alongside.

    Returns a ``{url: output_file_path}`` mapping of everything saved.
    """
    if exclude_patterns is None:
        exclude_patterns = DEFAULT_EXCLUDE_PATTERNS
    # Fix: a None session previously crashed inside fetch_page; create a
    # default session with the standard headers instead.
    if session is None:
        session = requests.Session()
        session.headers.update(DEFAULT_HEADERS)

    queue = deque([(normalize_url(start_url), 0)])  # (url, depth)
    visited = set()
    results = {}

    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, "_crawl_log.md")

    print(f"\n🕷️ 開始爬取:{start_url}")
    print(f" 深度上限:{max_depth},頁面上限:{max_pages},延遲:{delay}s")
    if same_path:
        print(f" 路徑限制:只爬 {urlparse(start_url).path} 底下的頁面")
    print()

    with open(log_path, "w", encoding="utf-8") as log:
        log.write(f"# 爬蟲記錄\n\n")
        log.write(f"- **起始 URL**:{start_url}\n")
        log.write(f"- **執行時間**:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        log.write("## 已爬取頁面\n\n")
        # Fix: the per-page table rows below previously had no table header,
        # so the log did not render as a Markdown table.
        log.write("| URL | 檔案 | 標題 |\n")
        log.write("| --- | --- | --- |\n")

        while queue and len(results) < max_pages:
            url, depth = queue.popleft()

            if url in visited:
                continue
            visited.add(url)

            # Apply exclusion rules, domain restriction, and path restriction.
            if should_skip(url, exclude_patterns):
                print(f" ⏭️ 跳過(排除規則):{url}")
                continue

            if not is_same_domain(url, start_url):
                continue

            if same_path and not is_same_path_prefix(url, start_url):
                continue

            # Fetch the page.
            print(f" {' ' * depth}📄 [深度{depth}] {url}")
            html, title = fetch_page(url, session)

            if not html:
                continue

            # Convert and save.
            filename = url_to_filename(url)
            output_path = os.path.join(output_dir, filename)
            md = html_to_markdown(html, url, title)

            with open(output_path, "w", encoding="utf-8") as f:
                f.write(md)

            results[url] = output_path
            lines = md.count("\n")
            # Fix: these two lines previously emitted a literal "(unknown)"
            # placeholder instead of the saved filename.
            print(f" {' ' * depth} ✅ → {filename}({lines} 行)")
            log.write(f"| {url} | [{filename}](./{filename}) | {title} |\n")

            # Keep following links while below the depth limit.
            if depth < max_depth:
                links = extract_links(html, url)
                for link in links:
                    if link not in visited and not should_skip(link, exclude_patterns):
                        queue.append((link, depth + 1))

            # Politeness delay between requests.
            if queue and delay > 0:
                time.sleep(delay)

        log.write(f"\n---\n共爬取 **{len(results)}** 頁\n")

    print(f"\n✅ 爬取完成:{len(results)} 頁 → {output_dir}")
    print(f"📋 爬取記錄:{log_path}")

    # Write an index of everything that was saved.
    index_path = os.path.join(output_dir, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(f"# 爬取索引:{start_url}\n\n")
        f.write(f"爬取時間:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("## 所有頁面\n\n")
        for url, path in results.items():
            fname = os.path.basename(path)
            f.write(f"- [{url}](./{fname})\n")

    print(f"📑 索引檔案:{index_path}")
    return results
# ── Entry Point ────────────────────────────────────────────────────────────────

def main():
    """Command-line entry point: dispatch to single-page or crawler mode."""
    parser = argparse.ArgumentParser(
        description="將網頁轉換為 Markdown(單頁或爬蟲模式)"
    )
    parser.add_argument("--url", required=True, help="起始 URL")
    parser.add_argument("--crawl", action="store_true", help="啟用爬蟲模式(追蹤同域連結)")
    parser.add_argument("--depth", type=int, default=2, help="爬蟲深度(預設 2)")
    parser.add_argument("--same-path", action="store_true", help="只爬相同路徑前綴下的頁面")
    parser.add_argument("--max-pages", type=int, default=50, help="最多爬取頁數(預設 50)")
    parser.add_argument("--delay", type=float, default=1.0, help="請求間隔秒數(預設 1.0)")
    parser.add_argument("--output", default="output.md", help="單頁模式輸出路徑(預設 output.md)")
    parser.add_argument("--output-dir", default="./scraped/", help="爬蟲模式輸出目錄(預設 ./scraped/)")
    parser.add_argument("--exclude", action="append", default=[], help="排除含有此字串的 URL(可多次使用)")
    args = parser.parse_args()

    # Merge built-in and user-supplied exclusion patterns.
    patterns = DEFAULT_EXCLUDE_PATTERNS + args.exclude

    # One shared HTTP session with browser-like headers.
    http = requests.Session()
    http.headers.update(DEFAULT_HEADERS)

    if not args.crawl:
        ok = convert_single(args.url, args.output, http)
        sys.exit(0 if ok else 1)

    crawl(
        start_url=args.url,
        output_dir=args.output_dir,
        max_depth=args.depth,
        same_path=args.same_path,
        max_pages=args.max_pages,
        delay=args.delay,
        exclude_patterns=patterns,
        session=http,
    )
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()