finance-dashboard/lib/sec-archive.js

433 lines
15 KiB
JavaScript
Raw Permalink Normal View History

2026-06-04 09:32:28 +00:00
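// Key SEC filings plus earnings / earnings-call material: fetched, then written to the local archive/ directory and SQLite so the data survives dead links.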
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import {
listSecFilings, upsertSecFiling, listEarningsEvents, upsertEarningsEvent,
getSecArchiveMeta, saveSecArchiveMeta,
} from './db.js';
import { fetchEarningsEvents } from './calendar.js';
import { resolveInvestorRelationsUrl } from './companyintel-links.js';
import { yahooQuoteSummary } from './yahoo-session.js';
// Directory of this module (ES modules do not provide __dirname themselves).
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Archived filings live under <module dir>/../archive/sec.
const ARCHIVE_ROOT = path.join(__dirname, '..', 'archive', 'sec');

// User-Agent header sent with every SEC request made by this module.
const SEC_UA = 'EmmyInvestDashboard/1.0 (personal learning tool; contact@example.com)';

// Upper bound on the number of filings collected per sync.
const MAX_FILINGS_SYNC = 36;

// Downloads larger than this many bytes (12 MiB) are skipped.
const MAX_FILE_BYTES = 12 * 1024 * 1024;

// Maximum number of characters kept in a plain-text excerpt.
const EXCERPT_LEN = 4000;

// Form types considered worth archiving (amendment suffixes are handled by isImportantForm).
const IMPORTANT_FORMS = new Set([
  '10-K',
  '10-Q',
  '8-K',
  '6-K',
  '20-F',
  'DEF 14A',
  'DEFA14A',
  'S-1',
  'S-3',
  'F-1',
  'F-3',
  '424B1',
  '424B2',
  '424B3',
  '424B4',
  '424B5',
  'SC 13D',
  'SC 13G',
  '4',
  '3',
  '5',
]);

// Traditional-Chinese display labels keyed by form type.
const FORM_ZH = {
  '10-K': '年報',
  '10-Q': '季報',
  '8-K': '重大事件(含財報公告)',
  '6-K': '外國公司重大事件',
  '20-F': '外國公司年報',
  'DEF 14A': '股東會說明書',
  'DEFA14A': '股東會補充',
  'S-1': '上市/增資說明',
  'S-3': '增資說明',
  'F-1': '外國公司上市',
  'F-3': '外國公司增資',
  '4': '內部人交易',
  '3': '內部人持股',
  '5': '內部人年度',
  'SC 13D': '主動持股申報',
  'SC 13G': '被動持股申報',
};
/**
 * Look up the Chinese display label for a form type.
 *
 * The lookup is tried first without a trailing "/A" suffix, then with the raw
 * value; when neither has a label the input is returned unchanged.
 *
 * @param {string} form - Form type, e.g. "10-K" or "10-K/A".
 * @returns {string} The label from FORM_ZH, or `form` itself when none matches.
 */
function formLabelZh(form) {
  const withoutAmendment = String(form || '').replace(/\/A$/i, '');
  for (const key of [withoutAmendment, form]) {
    const label = FORM_ZH[key];
    if (label) return label;
  }
  return form;
}
/**
 * Decide whether a form type should be archived.
 *
 * A form qualifies when it (with or without a trailing "/A") is listed in
 * IMPORTANT_FORMS, or when it starts with "424B" (case-insensitive).
 *
 * @param {string} form - Form type as reported in the submissions feed.
 * @returns {boolean} True when the form is considered important.
 */
function isImportantForm(form) {
  const trimmed = String(form || '').trim();
  if (trimmed === '') return false;

  const withoutAmendment = trimmed.replace(/\/A$/i, '');
  return (
    IMPORTANT_FORMS.has(withoutAmendment)
    || IMPORTANT_FORMS.has(trimmed)
    || /^424B/i.test(trimmed)
  );
}
// Lazily-built lookup: upper-cased ticker -> { cik, name }. Null until the first successful load.
let _tickerMap = null;

/**
 * Resolve a ticker symbol to its SEC CIK and company name.
 *
 * The ticker list is downloaded once from sec.gov and cached in `_tickerMap`
 * for later calls.
 *
 * @param {string} symbol - Ticker symbol; keys are stored upper-cased, so callers
 *   are expected to pass an upper-cased symbol.
 * @returns {Promise<{cik: string, name: string} | null>} The CIK zero-padded to
 *   10 digits plus the company title, or null when the ticker is unknown.
 * @throws {Error} When the ticker list request fails, times out, or returns a
 *   non-2xx status.
 */
async function tickerToCik(symbol) {
  if (!_tickerMap) {
    const res = await fetch('https://www.sec.gov/files/company_tickers.json', {
      headers: { 'User-Agent': SEC_UA },
      // Bound the request like the other fetch helpers so a stalled connection cannot hang a sync.
      signal: AbortSignal.timeout(20000),
    });
    if (!res.ok) throw new Error(`SEC tickers HTTP ${res.status}`);
    const listing = await res.json();

    // Build into a local first: if an entry is malformed and throws, no partial map gets cached.
    // A null-prototype object keeps inherited names ("toString", "constructor") from matching.
    const map = Object.create(null);
    for (const entry of Object.values(listing ?? {})) {
      if (!entry?.ticker) continue;
      map[String(entry.ticker).toUpperCase()] = {
        cik: String(entry.cik_str).padStart(10, '0'),
        name: entry.title,
      };
    }
    _tickerMap = map;
  }
  return _tickerMap[symbol] || null;
}
/**
 * GET a URL with the SEC User-Agent and return the response body as text.
 *
 * @param {string} url - Address to fetch.
 * @param {number} [ms=20000] - Milliseconds before the request is aborted.
 * @returns {Promise<string>} The response body.
 * @throws {Error} `HTTP <status>` on a non-2xx response; fetch/abort errors propagate.
 */
async function text(url, ms = 20000) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), ms);
  const requestInit = {
    headers: { 'User-Agent': SEC_UA },
    signal: controller.signal,
  };
  try {
    const response = await fetch(url, requestInit);
    if (response.ok) {
      return await response.text();
    }
    throw new Error(`HTTP ${response.status}`);
  } finally {
    // Always release the timer, whether the request succeeded or failed.
    clearTimeout(timeoutId);
  }
}
/**
 * GET a URL with the SEC User-Agent and parse the response as JSON.
 *
 * @param {string} url - Address to fetch.
 * @param {number} [ms=15000] - Timeout in milliseconds, enforced via AbortSignal.timeout.
 * @returns {Promise<any>} The parsed JSON body.
 * @throws {Error} `HTTP <status>` on a non-2xx response; fetch/timeout errors propagate.
 */
async function json(url, ms = 15000) {
  const requestInit = {
    headers: { 'User-Agent': SEC_UA, Accept: 'application/json' },
    signal: AbortSignal.timeout(ms),
  };
  const response = await fetch(url, requestInit);
  if (response.ok) {
    return response.json();
  }
  throw new Error(`HTTP ${response.status}`);
}
/**
 * Strip every dash from an accession number.
 *
 * @param {string} accn - Accession number, e.g. "0000320193-24-000001".
 * @returns {string} The value without dashes; falsy input yields "".
 */
function accNoDash(accn) {
  const raw = String(accn || '');
  return raw.split('-').join('');
}
/**
 * Build the local archive directory path for one filing:
 * ARCHIVE_ROOT/<symbol>/<accession without dashes>.
 *
 * @param {string} symbol - Ticker symbol used as the first path segment.
 * @param {string} accn - Accession number of the filing.
 * @returns {string} The joined directory path (the directory is not created here).
 */
function filingDir(symbol, accn) {
  const folder = accNoDash(accn);
  return path.join(ARCHIVE_ROOT, symbol, folder);
}
/**
 * Build the EDGAR Archives URL of a named document inside a filing folder.
 *
 * @param {number|string} cikNum - CIK as used in the URL path.
 * @param {string} accn - Accession number (dashes are removed for the folder name).
 * @param {string} primary - Document file name within the filing folder.
 * @returns {string} The document URL.
 */
function edgarPrimaryUrl(cikNum, accn, primary) {
  const folder = accNoDash(accn);
  const folderUrl = `https://www.sec.gov/Archives/edgar/data/${cikNum}/${folder}`;
  return `${folderUrl}/${primary}`;
}
/**
 * Build the EDGAR Archives URL of a filing's `<accession>.txt` file.
 *
 * @param {number|string} cikNum - CIK as used in the URL path.
 * @param {string} accn - Accession number; used dashed for the file name and
 *   without dashes for the folder name.
 * @returns {string} The .txt URL.
 */
function edgarTxtUrl(cikNum, accn) {
  const folder = accNoDash(accn);
  const fileName = `${accn}.txt`;
  return `https://www.sec.gov/Archives/edgar/data/${cikNum}/${folder}/${fileName}`;
}
/**
 * Build the EDGAR Archives URL of a filing's `<accession>-index.json` file.
 *
 * @param {number|string} cikNum - CIK as used in the URL path.
 * @param {string} accn - Accession number; used dashed for the file name and
 *   without dashes for the folder name.
 * @returns {string} The index JSON URL.
 */
function edgarIndexJsonUrl(cikNum, accn) {
  const folder = accNoDash(accn);
  const fileName = `${accn}-index.json`;
  return `https://www.sec.gov/Archives/edgar/data/${cikNum}/${folder}/${fileName}`;
}
/**
 * Create a directory, including any missing parents; an existing directory is fine.
 *
 * @param {string} dir - Directory path to create.
 * @returns {void}
 */
function ensureDir(dir) {
  const options = { recursive: true };
  fs.mkdirSync(dir, options);
}
/**
 * Write `content` to `filePath` only when nothing exists at that path yet.
 *
 * @param {string} filePath - Destination path.
 * @param {string|Buffer} content - Data to write.
 * @returns {boolean} True when the file was written, false when the path already existed.
 */
function writeIfAbsent(filePath, content) {
  const alreadyThere = fs.existsSync(filePath);
  if (!alreadyThere) {
    fs.writeFileSync(filePath, content);
  }
  return !alreadyThere;
}
// Named entities decoded in excerpts; any other named entity is left as written.
const NAMED_HTML_ENTITIES = new Map([
  ['amp', '&'], ['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"], ['nbsp', ' '],
  ['ndash', '–'], ['mdash', '—'], ['lsquo', '‘'], ['rsquo', '’'], ['ldquo', '“'], ['rdquo', '”'],
]);

// Replacement callback: turn one matched entity (`body` is the part between "&" and ";") into text.
function decodeHtmlEntity(match, body) {
  if (body[0] !== '#') return NAMED_HTML_ENTITIES.get(body) ?? match;
  const isHex = body[1] === 'x' || body[1] === 'X';
  const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
  // Control characters, lone surrogates and out-of-range values become a plain space.
  const isPrintable = codePoint >= 0x20 && codePoint <= 0x10ffff
    && !(codePoint >= 0xd800 && codePoint <= 0xdfff);
  return isPrintable ? String.fromCodePoint(codePoint) : ' ';
}

/**
 * Reduce an HTML (or SGML-ish text) document to a short plain-text excerpt.
 *
 * Script and style elements are dropped, remaining tags become spaces,
 * character references are decoded (decimal, hexadecimal and a small set of
 * named entities), whitespace is collapsed and the result is cut to
 * EXCERPT_LEN characters.
 *
 * @param {string} html - Markup to summarise; falsy input yields "".
 * @returns {string} Plain text of at most EXCERPT_LEN characters.
 */
function excerptFromHtml(html) {
  const plain = String(html || '')
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    // Decode after tag stripping so a decoded "<" is never mistaken for markup.
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, decodeHtmlEntity)
    // \s also matches the no-break space produced by &nbsp; / &#160;.
    .replace(/\s+/g, ' ')
    .trim();
  return plain.slice(0, EXCERPT_LEN);
}
/**
 * Heuristically decide whether a filing looks earnings-related.
 *
 * The description and the filing text are concatenated (description first),
 * limited to the first 120000 characters, and searched for "Item 2.02", its
 * heading text, or a few earnings keywords.
 *
 * @param {string} txt - Filing text.
 * @param {string} description - Filing description from the submissions feed.
 * @returns {boolean} True when any of the patterns matches.
 */
function isEarnings8k(txt, description) {
  const haystack = [description || '', txt || ''].join('\n').slice(0, 120000);
  const patterns = [
    /Item\s+2\.02/i,
    /Results of Operations and Financial Condition/i,
    /財報|earnings release|quarterly results/i,
  ];
  return patterns.some((pattern) => pattern.test(haystack));
}
/**
 * Pick the important filings out of a submissions payload.
 *
 * Reads the parallel arrays under `sub.filings.recent` (form, accessionNumber,
 * filingDate, reportDate, primaryDocument, primaryDocDescription/description),
 * keeps rows whose form passes isImportantForm and that have an accession
 * number, and stops once MAX_FILINGS_SYNC rows are collected.
 *
 * @param {object} sub - Submissions JSON; a missing `filings.recent` yields [].
 * @param {string} symbol - Ticker symbol copied onto every row.
 * @returns {Array<object>} Rows with symbol, accession, form, formZh, filedDate,
 *   reportDate, primaryDocument, description and isEarningsRelated (true for
 *   8-K and 8-K/A, to be refined later from the filing text).
 */
function collectFilingsFromSubmissions(sub, symbol) {
  const recent = sub?.filings?.recent || {};
  const forms = recent.form || [];
  const out = [];
  for (let i = 0; i < forms.length && out.length < MAX_FILINGS_SYNC; i++) {
    const form = forms[i];
    if (!isImportantForm(form)) continue;
    // Every column is optional-chained so one missing array cannot abort the whole sync.
    const accn = recent.accessionNumber?.[i];
    if (!accn) continue;
    out.push({
      symbol,
      accession: accn,
      form,
      formZh: formLabelZh(form),
      filedDate: recent.filingDate?.[i] || null,
      reportDate: recent.reportDate?.[i] || null,
      primaryDocument: recent.primaryDocument?.[i] || null,
      description: recent.primaryDocDescription?.[i] || recent.description?.[i] || '',
      isEarningsRelated: String(form).replace(/\/A$/i, '') === '8-K',
    });
  }
  return out;
}
/**
 * Download a URL to a local file, skipping bodies larger than MAX_FILE_BYTES.
 *
 * @param {string} url - Address to download.
 * @param {string} destPath - Destination file path; parent directories are created.
 * @returns {Promise<{skipped: boolean, reason?: string, size: number}>}
 *   `{ skipped: false, size }` after a successful write, or
 *   `{ skipped: true, reason: 'too_large', size }` when the body exceeds the limit
 *   (nothing is written in that case).
 * @throws {Error} `HTTP <status>` on a non-2xx response; fetch/abort errors propagate.
 */
async function downloadToFile(url, destPath) {
  const ctrl = new AbortController();
  // One 45 s budget covers both the headers and the body transfer.
  const timer = setTimeout(() => ctrl.abort(), 45000);
  try {
    const res = await fetch(url, { headers: { 'User-Agent': SEC_UA }, signal: ctrl.signal });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);

    // When the server announces an oversized body, bail out before buffering it in memory.
    // A missing header gives Number(null) === 0 and falls through to the post-download check.
    const declaredSize = Number(res.headers.get('content-length'));
    if (Number.isFinite(declaredSize) && declaredSize > MAX_FILE_BYTES) {
      ctrl.abort();
      return { skipped: true, reason: 'too_large', size: declaredSize };
    }

    const buf = Buffer.from(await res.arrayBuffer());
    if (buf.length > MAX_FILE_BYTES) return { skipped: true, reason: 'too_large', size: buf.length };
    ensureDir(path.dirname(destPath));
    fs.writeFileSync(destPath, buf);
    return { skipped: false, size: buf.length };
  } finally {
    clearTimeout(timer);
  }
}
// True when an index entry looks like an earnings exhibit stored as an .htm/.html/.txt file.
function isEarningsExhibitItem(name, description) {
  if (!/ex-99|press release|earnings/i.test(name + description)) return false;
  // Anchor the whole alternation to the end of the name so "slides.htmx.pdf" is rejected.
  return /\.(?:html?|txt)$/i.test(name);
}

/**
 * Archive one filing under its local directory and write a meta.json next to it.
 *
 * Downloads, each on a best-effort basis: the primary document (unless it is a
 * .txt), the full `<accession>.txt` submission, and - for earnings-related
 * filings - exhibits listed in the filing index that look like earnings
 * releases. When a previous meta.json shows the filing is already archived,
 * nothing is downloaded and the stored paths are returned.
 *
 * @param {object} meta - Filing row; `symbol`, `accession`, `primaryDocument`,
 *   `form` and `isEarningsRelated` are read here, every field is copied to the result.
 * @param {number|string} cikNum - CIK used to build EDGAR URLs.
 * @returns {Promise<object>} `meta` plus `localPrimary`, `localTxt`, `excerpt`,
 *   `earningsExhibits`, `archived` and `reused`; fresh archives also carry
 *   `edgarUrl` and `archivedAt`. Local paths are relative to the module's parent directory.
 */
async function archiveFiling(meta, cikNum) {
  const { symbol, accession, primaryDocument } = meta;
  const projectRoot = path.join(__dirname, '..');
  const dir = filingDir(symbol, accession);
  const metaPath = path.join(dir, 'meta.json');
  const primary = String(primaryDocument || `${accession}.txt`);
  // A .txt primary is already covered by the full-submission download below.
  const wantsPrimary = !primary.endsWith('.txt');

  if (fs.existsSync(metaPath)) {
    try {
      const prev = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
      // Text-only filings never get a localPrimary; treat their archived .txt as complete
      // so they are not downloaded again on every sync.
      if (prev.localPrimary || (!wantsPrimary && prev.localTxt)) {
        return {
          ...meta,
          localPrimary: prev.localPrimary || null,
          localTxt: prev.localTxt || null,
          excerpt: prev.excerpt || null,
          // Carry exhibits over so a re-sync does not lose the list recorded earlier.
          earningsExhibits: Array.isArray(prev.earningsExhibits) ? prev.earningsExhibits : [],
          archived: true,
          reused: true,
        };
      }
    } catch { /* unreadable meta.json: fall through and download again */ }
  }

  ensureDir(dir);
  const files = { localPrimary: null, localTxt: null, excerpt: null, archived: false, reused: false };

  if (wantsPrimary) {
    const url = edgarPrimaryUrl(cikNum, accession, primary);
    const ext = path.extname(primary) || '.htm';
    const dest = path.join(dir, `primary${ext}`);
    try {
      const r = await downloadToFile(url, dest);
      if (!r.skipped) {
        files.localPrimary = path.relative(projectRoot, dest);
        files.excerpt = excerptFromHtml(fs.readFileSync(dest, 'utf8'));
        files.archived = true;
      }
    } catch { /* best effort: keep metadata only */ }
  }

  const txtUrl = edgarTxtUrl(cikNum, accession);
  const txtDest = path.join(dir, 'filing.txt');
  try {
    const r = await downloadToFile(txtUrl, txtDest);
    if (!r.skipped) {
      files.localTxt = path.relative(projectRoot, txtDest);
      files.archived = true;
      if (!files.excerpt) {
        // Only the head of the submission is needed for an excerpt.
        const raw = fs.readFileSync(txtDest, 'utf8').slice(0, 80000);
        files.excerpt = excerptFromHtml(raw);
      }
    }
  } catch { /* best effort: the .txt copy is optional */ }

  const earningsExhibits = [];
  if (meta.isEarningsRelated) {
    try {
      const idx = await json(edgarIndexJsonUrl(cikNum, accession));
      const items = idx.directory?.item || [];
      for (const it of items) {
        const name = String(it.name || '');
        const desc = String(it.description || '');
        if (!isEarningsExhibitItem(name, desc)) continue;
        const exUrl = edgarPrimaryUrl(cikNum, accession, name);
        // Flatten anything that is not a word character, dot or dash into "_" for the local name.
        const exDest = path.join(dir, name.replace(/[^\w.\-]+/g, '_'));
        try {
          const r = await downloadToFile(exUrl, exDest);
          if (!r.skipped) {
            earningsExhibits.push({
              name,
              description: desc,
              localPath: path.relative(projectRoot, exDest),
              url: exUrl,
            });
          }
        } catch { /* skip this exhibit, keep the others */ }
      }
    } catch { /* no filing index available */ }
  }

  const fullMeta = {
    ...meta,
    ...files,
    earningsExhibits,
    edgarUrl: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${cikNum}&type=${encodeURIComponent(meta.form)}&dateb=&owner=include&count=40`,
    archivedAt: new Date().toISOString(),
  };
  fs.writeFileSync(metaPath, JSON.stringify(fullMeta, null, 2));
  return fullMeta;
}
/**
 * Fetch earnings-calendar events for one symbol and upsert each of them.
 *
 * The requested window runs from 400 days before now to 120 days after now
 * (UTC calendar dates).
 *
 * @param {string} symbol - Ticker symbol to query and to store on every event.
 * @returns {Promise<number>} Number of events upserted.
 */
async function syncEarningsCalendar(symbol) {
  const now = new Date();
  // ISO date (YYYY-MM-DD) of "now" shifted by a number of UTC days.
  const isoDayOffset = (days) => {
    const shifted = new Date(now);
    shifted.setUTCDate(shifted.getUTCDate() + days);
    return shifted.toISOString().slice(0, 10);
  };

  const windowStart = isoDayOffset(-400);
  const windowEnd = isoDayOffset(120);
  const events = await fetchEarningsEvents(windowStart, windowEnd, [symbol]);

  let count = 0;
  for (const event of events) {
    const { date, title, url } = event;
    upsertEarningsEvent({
      symbol,
      eventDate: date,
      title,
      titleZh: title,
      timeLabel: event.time || '',
      source: event.source || 'Nasdaq earnings',
      url,
      note: event.note || '',
      kind: 'earnings_calendar',
    });
    count += 1;
  }
  return count;
}
/**
 * Synchronise the local SEC archive for one ticker.
 *
 * Resolves the ticker to a CIK, and unless a recent sync makes the run
 * unnecessary, pulls the submissions feed, archives each candidate filing,
 * upserts a filing row (plus an earnings event for earnings-related filings),
 * refreshes the earnings calendar and stores sync metadata.
 *
 * @param {string} symbol - Ticker symbol; trimmed and upper-cased before use.
 * @param {object} [options]
 * @param {boolean} [options.force=false] - Ignore the soft refresh window
 *   (SEC_ARCHIVE_SOFT_HOURS environment variable, 12 hours when unset or not a number).
 * @returns {Promise<object>} `{ symbol, skipped, filings, earnings, meta }`, plus
 *   `synced` (one summary per processed filing) when the sync actually ran.
 * @throws {Error} `bad_symbol` for an empty symbol, `cik_not_found` for an
 *   unknown ticker; errors from the submissions request propagate.
 */
export async function syncSecArchive(symbol, { force = false } = {}) {
  const ticker = String(symbol || '').trim().toUpperCase();
  if (!ticker) throw new Error('bad_symbol');

  const hit = await tickerToCik(ticker);
  if (!hit) throw new Error('cik_not_found');

  const previousMeta = getSecArchiveMeta(ticker);
  const softHours = Number(process.env.SEC_ARCHIVE_SOFT_HOURS) || 12;
  const softMs = softHours * 3600 * 1000;
  const isFresh = previousMeta?.lastSyncAt && Date.now() - previousMeta.lastSyncAt < softMs;
  if (!force && isFresh) {
    return {
      symbol: ticker,
      skipped: true,
      filings: listSecFilings(ticker),
      earnings: listEarningsEvents(ticker),
      meta: previousMeta,
    };
  }

  const sub = await json(`https://data.sec.gov/submissions/CIK${hit.cik}.json`);
  const cikNum = Number(hit.cik);

  // Investor-relations link is optional; any failure just leaves it null.
  let investorUrl = null;
  try {
    const profile = await yahooQuoteSummary(ticker, 'assetProfile');
    investorUrl = resolveInvestorRelationsUrl(profile?.assetProfile?.website)?.url || null;
  } catch { /* */ }

  const candidates = collectFilingsFromSubmissions(sub, ticker);
  const synced = [];
  let downloaded = 0;

  for (const row of candidates) {
    let archived = null;
    try {
      archived = await archiveFiling({ ...row, cik: hit.cik, companyName: hit.name }, cikNum);
      if (archived.archived && !archived.reused) downloaded += 1;
    } catch {
      archived = { ...row, archived: false };
    }

    let earningsFlag = row.isEarningsRelated;
    let excerpt = archived?.excerpt || null;
    if (archived?.localTxt) {
      try {
        const raw = fs.readFileSync(path.join(__dirname, '..', archived.localTxt), 'utf8');
        if (earningsFlag) {
          // Pre-flagged filings are confirmed (or cleared) against the full text.
          earningsFlag = isEarnings8k(raw, row.description);
          if (earningsFlag && !excerpt) excerpt = excerptFromHtml(raw);
        } else if (isEarnings8k(raw.slice(0, 50000), row.description)) {
          // Other forms are only promoted, based on the first 50000 characters.
          earningsFlag = true;
        }
      } catch { /* */ }
    }

    const remoteUrl = row.primaryDocument
      ? edgarPrimaryUrl(cikNum, row.accession, row.primaryDocument)
      : edgarTxtUrl(cikNum, row.accession);
    const exhibits = archived?.earningsExhibits;

    upsertSecFiling({
      symbol: ticker,
      accession: row.accession,
      form: row.form,
      formZh: row.formZh,
      filedDate: row.filedDate,
      reportDate: row.reportDate,
      description: row.description,
      primaryDocument: row.primaryDocument,
      url: remoteUrl,
      localPrimary: archived?.localPrimary || null,
      localTxt: archived?.localTxt || null,
      excerpt,
      isEarningsRelated: earningsFlag ? 1 : 0,
      earningsExhibits: exhibits ? JSON.stringify(exhibits) : null,
    });

    if (earningsFlag) {
      upsertEarningsEvent({
        symbol: ticker,
        eventDate: row.reportDate || row.filedDate,
        title: `${ticker} 財報/重大事件 8-K`,
        titleZh: `${ticker} 財報公告8-K Item 2.02`,
        timeLabel: '',
        source: 'SEC 8-K',
        // No remote link is stored once a local primary copy exists.
        url: archived?.localPrimary ? null : remoteUrl,
        note: row.description || '已封存申報全文;法說逐字稿多由公司投資人關係頁發布',
        kind: 'sec_8k',
        accession: row.accession,
        transcriptSearchUrl: investorUrl,
      });
    }

    synced.push({
      ...row,
      archived: !!archived?.archived,
      localPrimary: archived?.localPrimary,
      isEarningsRelated: earningsFlag,
    });
  }

  // A failed calendar refresh counts as zero events rather than failing the sync.
  const earnN = await syncEarningsCalendar(ticker).catch(() => 0);
  const meta = {
    symbol: ticker,
    companyName: hit.name,
    cik: hit.cik,
    lastSyncAt: Date.now(),
    filingCount: listSecFilings(ticker).length,
    earningsCount: listEarningsEvents(ticker).length,
    downloadedThisRun: downloaded,
    earningsCalendarSynced: earnN,
  };
  saveSecArchiveMeta(ticker, meta);

  return {
    symbol: ticker,
    skipped: false,
    filings: listSecFilings(ticker),
    earnings: listEarningsEvents(ticker),
    meta,
    synced,
  };
}
/**
 * Read what is currently stored for a ticker without contacting the network.
 *
 * @param {string} symbol - Ticker symbol; trimmed and upper-cased before use.
 * @returns {{symbol: string, filings: any, earnings: any, meta: any}} The stored
 *   filings, earnings events and sync metadata as returned by the db helpers.
 */
export function getSecArchivePayload(symbol) {
  const ticker = String(symbol || '').trim().toUpperCase();
  const filings = listSecFilings(ticker);
  const earnings = listEarningsEvents(ticker);
  const meta = getSecArchiveMeta(ticker);
  return { symbol: ticker, filings, earnings, meta };
}
/**
 * Map (symbol, accession, file) to an absolute path inside the local archive.
 *
 * When the requested file does not exist, the filing's meta.json is consulted
 * and its recorded primary document - or failing that its .txt copy - is
 * returned instead.
 *
 * @param {string} symbol - Ticker symbol; trimmed and upper-cased before use.
 * @param {string} accession - Accession number of the filing.
 * @param {string} [file] - File name inside the filing directory; only its
 *   base name is used, and "primary.htm" is assumed when it is empty.
 * @returns {string|null} Path of an existing file, or null when the filing or
 *   file is unknown or the request points outside the archive.
 */
export function resolveArchiveFile(symbol, accession, file) {
  const ticker = String(symbol || '').trim().toUpperCase();
  const dir = filingDir(ticker, accession);

  // symbol/accession may contain "../" segments; refuse anything that is not strictly
  // inside ARCHIVE_ROOT so this function can never hand out files elsewhere on disk.
  const relDir = path.relative(ARCHIVE_ROOT, dir);
  const escapesRoot = relDir === ''
    || relDir === '..'
    || relDir.startsWith(`..${path.sep}`)
    || path.isAbsolute(relDir);
  if (escapesRoot) return null;
  if (!fs.existsSync(dir)) return null;

  const safe = path.basename(String(file || 'primary.htm'));
  const full = path.join(dir, safe);
  // Requiring the separator rejects "." / ".." (which resolve to a directory) and sibling
  // directories that merely share the same name prefix.
  if (!full.startsWith(dir + path.sep)) return null;
  if (fs.existsSync(full)) return full;

  const metaPath = path.join(dir, 'meta.json');
  if (fs.existsSync(metaPath)) {
    try {
      const m = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
      // Paths in meta.json are stored relative to the module's parent directory.
      for (const rel of [m.localPrimary, m.localTxt]) {
        if (!rel) continue;
        const p = path.join(__dirname, '..', rel);
        if (fs.existsSync(p)) return p;
      }
    } catch { /* unreadable meta.json: treat as not found */ }
  }
  return null;
}