// SEC key filings and earnings / earnings-call related material: fetched and written to
// the local archive/ directory plus SQLite so the data survives upstream link rot.
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import {
  listSecFilings,
  upsertSecFiling,
  listEarningsEvents,
  upsertEarningsEvent,
  getSecArchiveMeta,
  saveSecArchiveMeta,
} from './db.js';
import { fetchEarningsEvents } from './calendar.js';
import { resolveInvestorRelationsUrl } from './companyintel-links.js';
import { yahooQuoteSummary } from './yahoo-session.js';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Directory one level above this module; archived paths are stored relative to it.
const PROJECT_ROOT = path.join(__dirname, '..');
const ARCHIVE_ROOT = path.join(__dirname, '..', 'archive', 'sec');
const SEC_UA = 'EmmyInvestDashboard/1.0 (personal learning tool; contact@example.com)';
// Upper bound on the number of filings collected per sync.
const MAX_FILINGS_SYNC = 36;
// Downloads larger than this are skipped instead of being written to disk.
const MAX_FILE_BYTES = 12 * 1024 * 1024;
// Maximum number of characters kept in a plain-text excerpt.
const EXCERPT_LEN = 4000;

const IMPORTANT_FORMS = new Set([
  '10-K', '10-Q', '8-K', '6-K', '20-F', 'DEF 14A', 'DEFA14A',
  'S-1', 'S-3', 'F-1', 'F-3',
  '424B1', '424B2', '424B3', '424B4', '424B5',
  'SC 13D', 'SC 13G', '4', '3', '5',
]);

// Traditional-Chinese display labels keyed by base form type (runtime data, not comments).
const FORM_ZH = {
  '10-K': '年報',
  '10-Q': '季報',
  '8-K': '重大事件(含財報公告)',
  '6-K': '外國公司重大事件',
  '20-F': '外國公司年報',
  'DEF 14A': '股東會說明書',
  'DEFA14A': '股東會補充',
  'S-1': '上市/增資說明',
  'S-3': '增資說明',
  'F-1': '外國公司上市',
  'F-3': '外國公司增資',
  '4': '內部人交易',
  '3': '內部人持股',
  '5': '內部人年度',
  'SC 13D': '主動持股申報',
  'SC 13G': '被動持股申報',
};

/**
 * Map a form type to its Chinese label, ignoring a trailing "/A" amendment suffix.
 * @param {string} form - Form type, e.g. "10-K" or "8-K/A".
 * @returns {string} The label from FORM_ZH, or the input unchanged when no label exists.
 */
function formLabelZh(form) {
  const base = String(form || '').replace(/\/A$/i, '');
  return FORM_ZH[base] || FORM_ZH[form] || form;
}

/**
 * Decide whether a form type is one this module archives.
 * @param {string} form - Form type; a trailing "/A" is ignored.
 * @returns {boolean} True when the form is in IMPORTANT_FORMS or starts with "424B".
 */
function isImportantForm(form) {
  const f = String(form || '').trim();
  if (!f) return false;
  const base = f.replace(/\/A$/i, '');
  if (IMPORTANT_FORMS.has(base) || IMPORTANT_FORMS.has(f)) return true;
  return /^424B/i.test(f);
}

// Lazily populated ticker -> { cik, name } lookup; filled on first tickerToCik() call.
let _tickerMap = null;

/**
 * Resolve a ticker to its zero-padded 10-digit CIK and company title.
 * The ticker list is fetched once and cached for the lifetime of the module.
 * @param {string} symbol - Ticker; looked up as-is against upper-cased keys.
 * @returns {Promise<{cik: string, name: string} | null>} Null when the ticker is unknown.
 * @throws {Error} When the ticker list request returns a non-OK status.
 */
async function tickerToCik(symbol) {
  if (!_tickerMap) {
    const res = await fetch('https://www.sec.gov/files/company_tickers.json', {
      headers: { 'User-Agent': SEC_UA },
    });
    if (!res.ok) throw new Error(`SEC tickers HTTP ${res.status}`);
    const d = await res.json();
    // Build into a local, prototype-less object and publish it only once complete,
    // so a failure mid-way never leaves a partial cache behind.
    const map = Object.create(null);
    for (const k of Object.keys(d)) {
      map[String(d[k].ticker).toUpperCase()] = {
        cik: String(d[k].cik_str).padStart(10, '0'),
        name: d[k].title,
      };
    }
    _tickerMap = map;
  }
  return _tickerMap[symbol] || null;
}

/**
 * GET a URL with the SEC User-Agent and return the body as text.
 * @param {string} url
 * @param {number} [ms=20000] - Abort timeout in milliseconds.
 * @returns {Promise<string>}
 * @throws {Error} On a non-OK status or when the request is aborted.
 */
async function text(url, ms = 20000) {
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), ms);
  try {
    const res = await fetch(url, { headers: { 'User-Agent': SEC_UA }, signal: ctrl.signal });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    return await res.text();
  } finally {
    clearTimeout(timer);
  }
}

/**
 * GET a URL with the SEC User-Agent and return the parsed JSON body.
 * @param {string} url
 * @param {number} [ms=15000] - Timeout in milliseconds.
 * @returns {Promise<any>}
 * @throws {Error} On a non-OK status or timeout.
 */
async function json(url, ms = 15000) {
  const res = await fetch(url, {
    headers: { 'User-Agent': SEC_UA, Accept: 'application/json' },
    signal: AbortSignal.timeout(ms),
  });
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  return res.json();
}

/** Strip the dashes from an accession number. */
function accNoDash(accn) {
  return String(accn || '').replace(/-/g, '');
}

/** Local archive directory for one filing: ARCHIVE_ROOT/<symbol>/<accession without dashes>. */
function filingDir(symbol, accn) {
  return path.join(ARCHIVE_ROOT, symbol, accNoDash(accn));
}

/** URL of a named document inside a filing's EDGAR archive folder. */
function edgarPrimaryUrl(cikNum, accn, primary) {
  return `https://www.sec.gov/Archives/edgar/data/${cikNum}/${accNoDash(accn)}/${primary}`;
}

/** URL of the filing's "<accession>.txt" document. */
function edgarTxtUrl(cikNum, accn) {
  return `https://www.sec.gov/Archives/edgar/data/${cikNum}/${accNoDash(accn)}/${accn}.txt`;
}

/** URL of the filing's "<accession>-index.json" directory listing. */
function edgarIndexJsonUrl(cikNum, accn) {
  return `https://www.sec.gov/Archives/edgar/data/${cikNum}/${accNoDash(accn)}/${accn}-index.json`;
}

/** Create a directory (and any missing parents). */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}

/**
 * Write a file only when it does not exist yet.
 * @returns {boolean} True when the file was written, false when it already existed.
 */
function writeIfAbsent(filePath, content) {
  if (fs.existsSync(filePath)) return false;
  fs.writeFileSync(filePath, content);
  return true;
}

/**
 * True when `child` is strictly inside `parent` (both resolved to absolute paths).
 * Uses path.relative so that a sibling such as "/a/bc" is not mistaken for a child of "/a/b".
 */
function isPathInside(parent, child) {
  const rel = path.relative(path.resolve(parent), path.resolve(child));
  if (!rel || path.isAbsolute(rel)) return false;
  return rel !== '..' && !rel.startsWith(`..${path.sep}`);
}

/**
 * Reduce HTML (or SGML-ish filing text) to a whitespace-collapsed plain-text excerpt.
 * @param {string} html
 * @returns {string} At most EXCERPT_LEN characters.
 */
function excerptFromHtml(html) {
  // NOTE(review): the first two patterns were empty in the source this was recovered from
  // (which does not parse); they are reconstructed here as <script>/<style> block
  // strippers — confirm against version control.
  const plain = String(html || '')
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/&#\d+;/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return plain.slice(0, EXCERPT_LEN);
}

/**
 * Heuristic: does the description plus filing text look like an earnings announcement?
 * Only the first 120000 characters of the combined text are scanned.
 * @param {string} txt - Filing text.
 * @param {string} description - Filing description.
 * @returns {boolean}
 */
function isEarnings8k(txt, description) {
  const blob = `${description || ''}\n${txt || ''}`.slice(0, 120000);
  return /Item\s+2\.02/i.test(blob)
    || /Results of Operations and Financial Condition/i.test(blob)
    || /財報|earnings release|quarterly results/i.test(blob);
}

/**
 * Pick the important filings out of a submissions payload, in the payload's own order.
 * Reads the parallel arrays under `sub.filings.recent`.
 * @param {object} sub - Parsed submissions JSON.
 * @param {string} symbol - Ticker copied onto every row.
 * @returns {object[]} At most MAX_FILINGS_SYNC rows.
 */
function collectFilingsFromSubmissions(sub, symbol) {
  const recent = sub?.filings?.recent || {};
  const forms = recent.form || [];
  const out = [];
  for (let i = 0; i < forms.length && out.length < MAX_FILINGS_SYNC; i++) {
    const form = forms[i];
    if (!isImportantForm(form)) continue;
    const accn = recent.accessionNumber?.[i];
    if (!accn) continue;
    out.push({
      symbol,
      accession: accn,
      form,
      formZh: formLabelZh(form),
      filedDate: recent.filingDate?.[i] || null,
      reportDate: recent.reportDate?.[i] || null,
      primaryDocument: recent.primaryDocument?.[i] || null,
      description: recent.primaryDocDescription?.[i] || recent.description?.[i] || '',
      // Initial guess only: every 8-K (or 8-K/A) is flagged; the caller refines it from the text.
      isEarningsRelated: String(form).replace(/\/A$/i, '') === '8-K',
    });
  }
  return out;
}

/**
 * Download a URL to a local file unless it exceeds MAX_FILE_BYTES.
 * @param {string} url
 * @param {string} destPath - Destination; parent directories are created as needed.
 * @returns {Promise<{skipped: boolean, size: number, reason?: string}>}
 * @throws {Error} On a non-OK status or when the 45 s timeout aborts the request.
 */
async function downloadToFile(url, destPath) {
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), 45000);
  try {
    const res = await fetch(url, { headers: { 'User-Agent': SEC_UA }, signal: ctrl.signal });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    // Cheap early exit: when the server declares an oversized body, don't buffer it at all.
    const declared = Number(res.headers.get('content-length'));
    if (Number.isFinite(declared) && declared > MAX_FILE_BYTES) {
      ctrl.abort();
      return { skipped: true, reason: 'too_large', size: declared };
    }
    const buf = Buffer.from(await res.arrayBuffer());
    if (buf.length > MAX_FILE_BYTES) return { skipped: true, reason: 'too_large', size: buf.length };
    ensureDir(path.dirname(destPath));
    fs.writeFileSync(destPath, buf);
    return { skipped: false, size: buf.length };
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Archive one filing under filingDir(symbol, accession): the primary document, the
 * "<accession>.txt" document and, for rows flagged isEarningsRelated, matching exhibits.
 * A meta.json is written alongside; when one already exists with a `localPrimary` entry
 * the previous result is reused and nothing is downloaded.
 * Individual download failures are tolerated (best effort); the metadata is still written.
 * @param {object} meta - Filing row (symbol, accession, primaryDocument, form, ...).
 * @param {number} cikNum - Numeric CIK used in EDGAR archive URLs.
 * @returns {Promise<object>} The row merged with localPrimary, localTxt, excerpt,
 *   archived, reused and earningsExhibits.
 */
async function archiveFiling(meta, cikNum) {
  const { symbol, accession, primaryDocument } = meta;
  const dir = filingDir(symbol, accession);
  const metaPath = path.join(dir, 'meta.json');

  if (fs.existsSync(metaPath)) {
    try {
      const prev = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
      if (prev.localPrimary) {
        return {
          ...meta,
          localPrimary: prev.localPrimary,
          localTxt: prev.localTxt || null,
          excerpt: prev.excerpt || null,
          // Carry the exhibits over so a re-sync does not overwrite them with null downstream.
          earningsExhibits: Array.isArray(prev.earningsExhibits) ? prev.earningsExhibits : [],
          archived: true,
          reused: true,
        };
      }
    } catch {
      /* unreadable meta.json: fall through and re-download */
    }
  }

  ensureDir(dir);
  const files = { localPrimary: null, localTxt: null, excerpt: null, archived: false, reused: false };

  // A missing primary document, or one that is itself a .txt, is covered by the .txt download below.
  if (primaryDocument && !primaryDocument.endsWith('.txt')) {
    const url = edgarPrimaryUrl(cikNum, accession, primaryDocument);
    const ext = path.extname(primaryDocument) || '.htm';
    const dest = path.join(dir, `primary${ext}`);
    try {
      const r = await downloadToFile(url, dest);
      if (!r.skipped) {
        files.localPrimary = path.relative(PROJECT_ROOT, dest);
        files.excerpt = excerptFromHtml(fs.readFileSync(dest, 'utf8'));
        files.archived = true;
      }
    } catch {
      /* best effort: keep metadata only */
    }
  }

  const txtUrl = edgarTxtUrl(cikNum, accession);
  const txtDest = path.join(dir, 'filing.txt');
  try {
    const r = await downloadToFile(txtUrl, txtDest);
    if (!r.skipped) {
      files.localTxt = path.relative(PROJECT_ROOT, txtDest);
      files.archived = true;
      if (!files.excerpt) {
        const raw = fs.readFileSync(txtDest, 'utf8').slice(0, 80000);
        files.excerpt = excerptFromHtml(raw);
      }
    }
  } catch {
    /* best effort: the .txt document is optional */
  }

  const earningsExhibits = [];
  if (meta.isEarningsRelated) {
    try {
      const idx = await json(edgarIndexJsonUrl(cikNum, accession));
      const items = idx.directory?.item || [];
      for (const it of items) {
        const name = String(it.name || '');
        const desc = String(it.description || '');
        if (!/ex-99|press release|earnings/i.test(name + desc)) continue;
        // Anchor every alternative at the end of the name (a bare /\.htm|…$/ matches ".htm" anywhere).
        if (!/\.(?:html?|txt)$/i.test(name)) continue;
        const exUrl = edgarPrimaryUrl(cikNum, accession, name);
        const exDest = path.join(dir, name.replace(/[^\w.\-]+/g, '_'));
        try {
          const r = await downloadToFile(exUrl, exDest);
          if (!r.skipped) {
            earningsExhibits.push({
              name,
              description: desc,
              localPath: path.relative(PROJECT_ROOT, exDest),
              url: exUrl,
            });
          }
        } catch {
          /* skip this exhibit */
        }
      }
    } catch {
      /* no index available */
    }
  }

  const fullMeta = {
    ...meta,
    ...files,
    earningsExhibits,
    edgarUrl: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${cikNum}&type=${encodeURIComponent(meta.form)}&dateb=&owner=include&count=40`,
    archivedAt: new Date().toISOString(),
  };
  fs.writeFileSync(metaPath, JSON.stringify(fullMeta, null, 2));
  return fullMeta;
}

/**
 * Fetch earnings-calendar events for one symbol over a window from 400 days ago to
 * 120 days ahead (UTC dates) and upsert each of them.
 * @param {string} symbol
 * @returns {Promise<number>} Number of events upserted.
 */
async function syncEarningsCalendar(symbol) {
  const today = new Date();
  const start = new Date(today);
  start.setUTCDate(start.getUTCDate() - 400);
  const end = new Date(today);
  end.setUTCDate(end.getUTCDate() + 120);
  const startISO = start.toISOString().slice(0, 10);
  const endISO = end.toISOString().slice(0, 10);

  const events = await fetchEarningsEvents(startISO, endISO, [symbol]);
  let n = 0;
  for (const ev of events) {
    upsertEarningsEvent({
      symbol,
      eventDate: ev.date,
      title: ev.title,
      titleZh: ev.title,
      timeLabel: ev.time || '',
      source: ev.source || 'Nasdaq earnings',
      url: ev.url,
      note: ev.note || '',
      kind: 'earnings_calendar',
    });
    n++;
  }
  return n;
}

/**
 * Sync a symbol's important filings: archive documents locally, upsert filing rows and
 * derived earnings events, refresh the earnings calendar and save sync metadata.
 * Unless `force` is set, a sync newer than SEC_ARCHIVE_SOFT_HOURS (default 12) hours is
 * skipped and the stored data is returned instead.
 * @param {string} symbol - Ticker; trimmed and upper-cased.
 * @param {{force?: boolean}} [options]
 * @returns {Promise<object>} `{ symbol, skipped, filings, earnings, meta }`, plus `synced`
 *   when a sync actually ran.
 * @throws {Error} 'bad_symbol' for an empty symbol, 'cik_not_found' for an unknown ticker,
 *   or an HTTP error from the ticker/submissions requests.
 */
export async function syncSecArchive(symbol, { force = false } = {}) {
  const sym = String(symbol || '').trim().toUpperCase();
  if (!sym) throw new Error('bad_symbol');
  const hit = await tickerToCik(sym);
  if (!hit) throw new Error('cik_not_found');

  const meta0 = getSecArchiveMeta(sym);
  const softMs = (Number(process.env.SEC_ARCHIVE_SOFT_HOURS) || 12) * 3600 * 1000;
  if (!force && meta0?.lastSyncAt && Date.now() - meta0.lastSyncAt < softMs) {
    return {
      symbol: sym,
      skipped: true,
      filings: listSecFilings(sym),
      earnings: listEarningsEvents(sym),
      meta: meta0,
    };
  }

  const sub = await json(`https://data.sec.gov/submissions/CIK${hit.cik}.json`);
  // Numeric form drops the zero padding, which is what the archive URLs are built with.
  const cikNum = Number(hit.cik);

  let investorUrl = null;
  try {
    const y = await yahooQuoteSummary(sym, 'assetProfile');
    investorUrl = resolveInvestorRelationsUrl(y?.assetProfile?.website)?.url || null;
  } catch {
    /* best effort: the investor-relations link is optional */
  }

  const candidates = collectFilingsFromSubmissions(sub, sym);
  const synced = [];
  let downloaded = 0;

  for (const row of candidates) {
    let archived = null;
    try {
      archived = await archiveFiling({ ...row, cik: hit.cik, companyName: hit.name }, cikNum);
      if (archived.archived && !archived.reused) downloaded++;
    } catch {
      archived = { ...row, archived: false };
    }

    let earningsFlag = row.isEarningsRelated;
    let excerpt = archived?.excerpt || null;
    if (archived?.localTxt) {
      try {
        const txt = fs.readFileSync(path.join(PROJECT_ROOT, archived.localTxt), 'utf8');
        if (earningsFlag) {
          // Flagged row (an 8-K): confirm against the text, which may clear the flag.
          earningsFlag = isEarnings8k(txt, row.description);
          if (earningsFlag && !excerpt) excerpt = excerptFromHtml(txt);
        } else if (isEarnings8k(txt.slice(0, 50000), row.description)) {
          // Unflagged row: the text can only raise the flag.
          earningsFlag = true;
        }
      } catch {
        /* best effort: keep the initial flag when the text cannot be read */
      }
    }

    const remoteUrl = row.primaryDocument
      ? edgarPrimaryUrl(cikNum, row.accession, row.primaryDocument)
      : edgarTxtUrl(cikNum, row.accession);

    upsertSecFiling({
      symbol: sym,
      accession: row.accession,
      form: row.form,
      formZh: row.formZh,
      filedDate: row.filedDate,
      reportDate: row.reportDate,
      description: row.description,
      primaryDocument: row.primaryDocument,
      url: remoteUrl,
      localPrimary: archived?.localPrimary || null,
      localTxt: archived?.localTxt || null,
      excerpt,
      isEarningsRelated: earningsFlag ? 1 : 0,
      earningsExhibits: archived?.earningsExhibits ? JSON.stringify(archived.earningsExhibits) : null,
    });

    if (earningsFlag) {
      upsertEarningsEvent({
        symbol: sym,
        eventDate: row.reportDate || row.filedDate,
        title: `${sym} 財報/重大事件 8-K`,
        titleZh: `${sym} 財報公告(8-K Item 2.02)`,
        timeLabel: '',
        source: 'SEC 8-K',
        // No remote URL is stored once a local primary copy exists.
        url: archived?.localPrimary ? null : remoteUrl,
        note: row.description || '已封存申報全文;法說逐字稿多由公司投資人關係頁發布',
        kind: 'sec_8k',
        accession: row.accession,
        transcriptSearchUrl: investorUrl,
      });
    }

    synced.push({
      ...row,
      archived: !!archived?.archived,
      localPrimary: archived?.localPrimary,
      isEarningsRelated: earningsFlag,
    });
  }

  // Calendar sync is best effort; a failure counts as zero events.
  const earnN = await syncEarningsCalendar(sym).catch(() => 0);

  const meta = {
    symbol: sym,
    companyName: hit.name,
    cik: hit.cik,
    lastSyncAt: Date.now(),
    filingCount: listSecFilings(sym).length,
    earningsCount: listEarningsEvents(sym).length,
    downloadedThisRun: downloaded,
    earningsCalendarSynced: earnN,
  };
  saveSecArchiveMeta(sym, meta);

  return {
    symbol: sym,
    skipped: false,
    filings: listSecFilings(sym),
    earnings: listEarningsEvents(sym),
    meta,
    synced,
  };
}

/**
 * Read the stored filings, earnings events and sync metadata for a symbol (no network).
 * @param {string} symbol - Ticker; trimmed and upper-cased.
 * @returns {{symbol: string, filings: any, earnings: any, meta: any}}
 */
export function getSecArchivePayload(symbol) {
  const sym = String(symbol || '').trim().toUpperCase();
  return {
    symbol: sym,
    filings: listSecFilings(sym),
    earnings: listEarningsEvents(sym),
    meta: getSecArchiveMeta(sym),
  };
}

/**
 * Resolve a file inside a filing's archive directory to an absolute path.
 * Only the basename of `file` is used, and both the filing directory and the resolved
 * file must stay inside ARCHIVE_ROOT, so ".." segments in any argument cannot escape it.
 * When the requested file is missing, falls back to the localPrimary, then localTxt,
 * path recorded in the filing's meta.json.
 * @param {string} symbol - Ticker; trimmed and upper-cased.
 * @param {string} accession - Accession number (with or without dashes).
 * @param {string} [file] - Requested file name; defaults to "primary.htm".
 * @returns {string | null} Path of an existing file, or null when nothing matches.
 */
export function resolveArchiveFile(symbol, accession, file) {
  const sym = String(symbol || '').trim().toUpperCase();
  const dir = path.resolve(filingDir(sym, accession));
  // Reject symbol/accession values that resolve to ARCHIVE_ROOT itself or outside it.
  if (!isPathInside(ARCHIVE_ROOT, dir)) return null;
  if (!fs.existsSync(dir)) return null;

  const safe = path.basename(String(file || 'primary.htm'));
  const full = path.join(dir, safe);
  if (!isPathInside(dir, full)) return null;
  if (fs.existsSync(full)) return full;

  const metaPath = path.join(dir, 'meta.json');
  if (fs.existsSync(metaPath)) {
    try {
      const m = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
      for (const rel of [m.localPrimary, m.localTxt]) {
        if (!rel) continue;
        const p = path.join(PROJECT_ROOT, rel);
        if (fs.existsSync(p)) return p;
      }
    } catch {
      /* unreadable meta.json: nothing to fall back to */
    }
  }
  return null;
}