finance-dashboard/lib/sec-archive.js

433 lines
15 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

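// SEC key filings and earnings-report / earnings-call materials: after fetching, write them to the local archive/ directory and SQLite so the data survives dead links.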
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import {
listSecFilings, upsertSecFiling, listEarningsEvents, upsertEarningsEvent,
getSecArchiveMeta, saveSecArchiveMeta,
} from './db.js';
import { fetchEarningsEvents } from './calendar.js';
import { resolveInvestorRelationsUrl } from './companyintel-links.js';
import { yahooQuoteSummary } from './yahoo-session.js';
// Directory containing this module (ES modules do not provide __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Root folder for archived SEC files: <module dir>/../archive/sec.
const ARCHIVE_ROOT = path.join(__dirname, '..', 'archive', 'sec');
// User-Agent header value sent with the SEC requests made in this file.
const SEC_UA = 'EmmyInvestDashboard/1.0 (personal learning tool; contact@example.com)';
// Maximum number of filings collected from one submissions payload per sync.
const MAX_FILINGS_SYNC = 36;
// Downloads larger than this many bytes (12 MiB) are skipped instead of saved.
const MAX_FILE_BYTES = 12 * 1024 * 1024;
// Maximum length (in UTF-16 code units) of the plain-text excerpt kept per filing.
const EXCERPT_LEN = 4000;
// Form types that are archived; isImportantForm also checks the form with a trailing "/A" removed.
const IMPORTANT_FORMS = new Set([
'10-K', '10-Q', '8-K', '6-K', '20-F', 'DEF 14A', 'DEFA14A', 'S-1', 'S-3', 'F-1', 'F-3',
'424B1', '424B2', '424B3', '424B4', '424B5', 'SC 13D', 'SC 13G', '4', '3', '5',
]);
// Chinese display labels keyed by form type (looked up by formLabelZh).
const FORM_ZH = {
'10-K': '年報', '10-Q': '季報', '8-K': '重大事件(含財報公告)', '6-K': '外國公司重大事件',
'20-F': '外國公司年報', 'DEF 14A': '股東會說明書', 'DEFA14A': '股東會補充', 'S-1': '上市/增資說明',
'S-3': '增資說明', 'F-1': '外國公司上市', 'F-3': '外國公司增資', '4': '內部人交易',
'3': '內部人持股', '5': '內部人年度', 'SC 13D': '主動持股申報', 'SC 13G': '被動持股申報',
};
/**
 * Look up the Chinese display label for a form type.
 *
 * The lookup is tried first with a trailing "/A" (amendment marker) removed,
 * then with the value exactly as given.
 *
 * @param {string} form - Form type, e.g. "10-K" or "10-K/A".
 * @returns {*} The label from FORM_ZH, or `form` unchanged when no label exists.
 */
function formLabelZh(form) {
  const withoutAmendment = String(form || '').replace(/\/A$/i, '');
  const label = FORM_ZH[withoutAmendment] || FORM_ZH[form];
  if (label) {
    return label;
  }
  return form;
}
/**
 * Decide whether a form type should be archived.
 *
 * A form qualifies when it (trimmed, with or without a trailing "/A")
 * is in IMPORTANT_FORMS, or when it starts with "424B".
 *
 * @param {string} form - Form type as reported by the submissions feed.
 * @returns {boolean} True when the form is considered important.
 */
function isImportantForm(form) {
  const trimmed = String(form || '').trim();
  if (trimmed === '') {
    return false;
  }
  const candidates = [trimmed.replace(/\/A$/i, ''), trimmed];
  const listed = candidates.some((candidate) => IMPORTANT_FORMS.has(candidate));
  return listed || /^424B/i.test(trimmed);
}
// Cached ticker -> { cik, name } table; null until the first successful load.
let _tickerMap = null;
// In-flight load shared by concurrent callers so the table is fetched once.
let _tickerMapLoading = null;

/**
 * Download SEC's company_tickers.json and build the lookup table.
 *
 * @returns {Promise<Object>} Prototype-less object keyed by upper-case ticker.
 * @throws {Error} On a non-2xx response, timeout, or unparsable body.
 */
async function loadTickerMap() {
  const res = await fetch('https://www.sec.gov/files/company_tickers.json', {
    headers: { 'User-Agent': SEC_UA },
    // Bound the request so a stalled connection cannot hang every sync.
    signal: AbortSignal.timeout(20000),
  });
  if (!res.ok) throw new Error(`SEC tickers HTTP ${res.status}`);
  const data = await res.json();
  // No prototype: a lookup such as "constructor" must miss instead of
  // returning an inherited Object.prototype member.
  const map = Object.create(null);
  for (const entry of Object.values(data || {})) {
    if (!entry?.ticker) continue;
    map[String(entry.ticker).toUpperCase()] = {
      cik: String(entry.cik_str).padStart(10, '0'),
      name: entry.title,
    };
  }
  return map;
}

/**
 * Resolve a ticker symbol to its SEC CIK and company name.
 *
 * The table is loaded on first use and cached for the process lifetime.
 * It is published only after it has been built completely, so a failed or
 * interrupted load is retried on the next call rather than leaving a
 * partially filled cache behind.
 *
 * @param {string} symbol - Ticker symbol; surrounding whitespace and case are ignored.
 * @returns {Promise<{cik: string, name: string}|null>} CIK zero-padded to 10 digits, or null when unknown.
 * @throws {Error} When the ticker table cannot be downloaded.
 */
async function tickerToCik(symbol) {
  if (!_tickerMap) {
    _tickerMapLoading ??= loadTickerMap();
    try {
      _tickerMap = await _tickerMapLoading;
    } finally {
      // Success: the cache is set. Failure: allow the next call to retry.
      _tickerMapLoading = null;
    }
  }
  const key = String(symbol || '').trim().toUpperCase();
  return _tickerMap[key] || null;
}
/**
 * Fetch a URL and return the response body as a string.
 *
 * The request carries the SEC User-Agent header and is aborted after `ms`
 * milliseconds.
 *
 * @param {string} url - Address to fetch.
 * @param {number} [ms=20000] - Timeout in milliseconds.
 * @returns {Promise<string>} Response body text.
 * @throws {Error} `HTTP <status>` when the response is not ok.
 */
async function text(url, ms = 20000) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), ms);
  try {
    const request = { headers: { 'User-Agent': SEC_UA }, signal: controller.signal };
    const response = await fetch(url, request);
    if (response.ok) {
      return await response.text();
    }
    throw new Error(`HTTP ${response.status}`);
  } finally {
    // Always release the timer, whether the request succeeded or failed.
    clearTimeout(timeoutId);
  }
}
/**
 * Fetch a URL and parse the response body as JSON.
 *
 * Sends the SEC User-Agent plus `Accept: application/json`; the request is
 * cancelled via `AbortSignal.timeout` after `ms` milliseconds.
 *
 * @param {string} url - Address to fetch.
 * @param {number} [ms=15000] - Timeout in milliseconds.
 * @returns {Promise<*>} Parsed JSON payload.
 * @throws {Error} `HTTP <status>` when the response is not ok.
 */
async function json(url, ms = 15000) {
  const headers = { 'User-Agent': SEC_UA, Accept: 'application/json' };
  const signal = AbortSignal.timeout(ms);
  const response = await fetch(url, { headers, signal });
  if (response.ok) {
    return response.json();
  }
  throw new Error(`HTTP ${response.status}`);
}
/**
 * Remove every dash from an accession number.
 *
 * @param {string} accn - Accession number, e.g. "0000320193-23-000106".
 * @returns {string} The value with all "-" removed; "" for falsy input.
 */
function accNoDash(accn) {
  const raw = String(accn || '');
  return raw.split('-').join('');
}
/**
 * Build the local archive directory for one filing:
 * ARCHIVE_ROOT/<symbol>/<accession without dashes>.
 *
 * @param {string} symbol - Ticker symbol used as the first folder level.
 * @param {string} accn - Accession number of the filing.
 * @returns {string} Directory path (not created by this function).
 */
function filingDir(symbol, accn) {
  const folderName = accNoDash(accn);
  return path.join(ARCHIVE_ROOT, symbol, folderName);
}
/**
 * Build the EDGAR Archives URL of a single document inside a filing folder.
 *
 * @param {number|string} cikNum - CIK as it should appear in the URL path.
 * @param {string} accn - Accession number (dashes are stripped for the folder name).
 * @param {string} primary - Document file name within the filing folder.
 * @returns {string} Absolute URL of the document.
 */
function edgarPrimaryUrl(cikNum, accn, primary) {
  const base = 'https://www.sec.gov/Archives/edgar/data';
  const folder = accNoDash(accn);
  return `${base}/${cikNum}/${folder}/${primary}`;
}
/**
 * Build the EDGAR Archives URL of a filing's full-submission text file,
 * named `<accession>.txt` inside the dash-less accession folder.
 *
 * @param {number|string} cikNum - CIK as it should appear in the URL path.
 * @param {string} accn - Accession number, with dashes.
 * @returns {string} Absolute URL of the .txt file.
 */
function edgarTxtUrl(cikNum, accn) {
  const base = 'https://www.sec.gov/Archives/edgar/data';
  const folder = accNoDash(accn);
  return `${base}/${cikNum}/${folder}/${accn}.txt`;
}
/**
 * Build the URL of a filing's `<accession>-index.json` document inside the
 * dash-less accession folder.
 *
 * @param {number|string} cikNum - CIK as it should appear in the URL path.
 * @param {string} accn - Accession number, with dashes.
 * @returns {string} Absolute URL of the index JSON.
 */
function edgarIndexJsonUrl(cikNum, accn) {
  const base = 'https://www.sec.gov/Archives/edgar/data';
  const folder = accNoDash(accn);
  return `${base}/${cikNum}/${folder}/${accn}-index.json`;
}
/**
 * Create a directory, including any missing parents.
 * Does nothing when the directory already exists.
 *
 * @param {string} dir - Directory path to create.
 * @returns {void}
 */
function ensureDir(dir) {
  const options = { recursive: true };
  fs.mkdirSync(dir, options);
}
/**
 * Write `content` to `filePath` only when nothing exists at that path yet.
 *
 * @param {string} filePath - Destination path.
 * @param {string|Buffer} content - Data to write.
 * @returns {boolean} True when the file was written, false when the path already existed.
 */
function writeIfAbsent(filePath, content) {
  const alreadyThere = fs.existsSync(filePath);
  if (!alreadyThere) {
    fs.writeFileSync(filePath, content);
  }
  return !alreadyThere;
}
/**
 * Reduce an HTML (or SGML-wrapped text) document to a short plain-text excerpt.
 *
 * Steps, in order: drop <script> and <style> elements with their content,
 * replace every remaining tag with a space, blank out numeric character
 * references (decimal and hexadecimal) and `&nbsp;`, collapse whitespace runs,
 * trim, and cut to EXCERPT_LEN characters.
 *
 * Other named entities (for example `&amp;` or `&lt;`) are deliberately left
 * encoded, so the excerpt never gains raw markup characters it did not
 * already contain.
 *
 * @param {string} html - Source markup; falsy values are treated as "".
 * @returns {string} Plain-text excerpt, at most EXCERPT_LEN characters long.
 */
function excerptFromHtml(html) {
  const plain = String(html || '')
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    // Decimal refs were already removed; hex refs and &nbsp; are the same
    // kind of typographic noise and previously leaked into the excerpt.
    .replace(/&(?:#\d+|#x[0-9a-f]+|nbsp);/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return plain.slice(0, EXCERPT_LEN);
}
/**
 * Heuristically decide whether a filing is an earnings announcement.
 *
 * The description and the filing text are joined (description first) and only
 * the first 120000 characters are scanned for any of the earnings markers.
 *
 * @param {string} txt - Filing text; falsy values are treated as "".
 * @param {string} description - Filing description; falsy values are treated as "".
 * @returns {boolean} True when at least one marker is found.
 */
function isEarnings8k(txt, description) {
  const haystack = `${description || ''}\n${txt || ''}`.slice(0, 120000);
  const markers = [
    /Item\s+2\.02/i,
    /Results of Operations and Financial Condition/i,
    /財報|earnings release|quarterly results/i,
  ];
  return markers.some((marker) => marker.test(haystack));
}
/**
 * Pick the important filings out of a submissions payload.
 *
 * Reads the parallel arrays under `sub.filings.recent` (form,
 * accessionNumber, filingDate, ...) and returns one record per important
 * form, in feed order, stopping at MAX_FILINGS_SYNC records. Rows without an
 * accession number are skipped. Any of the parallel arrays may be missing;
 * the corresponding field then falls back to null / "".
 *
 * @param {object} sub - Parsed submissions JSON; a missing `filings.recent` yields [].
 * @param {string} symbol - Ticker symbol copied onto every record.
 * @returns {Array<object>} Filing records with symbol, accession, form, formZh,
 *   filedDate, reportDate, primaryDocument, description and isEarningsRelated
 *   (true for 8-K and 8-K/A at this stage).
 */
function collectFilingsFromSubmissions(sub, symbol) {
  const recent = sub?.filings?.recent || {};
  const forms = recent.form || [];
  const out = [];
  for (let i = 0; i < forms.length && out.length < MAX_FILINGS_SYNC; i++) {
    const form = forms[i];
    if (!isImportantForm(form)) continue;
    const accn = recent.accessionNumber?.[i];
    if (!accn) continue;
    out.push({
      symbol,
      accession: accn,
      form,
      formZh: formLabelZh(form),
      filedDate: recent.filingDate?.[i] || null,
      reportDate: recent.reportDate?.[i] || null,
      primaryDocument: recent.primaryDocument?.[i] || null,
      description: recent.primaryDocDescription?.[i] || recent.description?.[i] || '',
      // String() guards against a non-string form value slipping through.
      isEarningsRelated: String(form).replace(/\/A$/i, '') === '8-K',
    });
  }
  return out;
}
/**
 * Download a URL to a local file, refusing anything above MAX_FILE_BYTES.
 *
 * When the server declares a Content-Length above the limit the transfer is
 * aborted before the body is read, so oversized files are not pulled into
 * memory first. The size is checked again after reading because the header
 * may be absent or may describe a compressed payload.
 *
 * @param {string} url - Address to download.
 * @param {string} destPath - Destination file; parent directories are created.
 * @returns {Promise<{skipped: boolean, reason?: string, size: number}>}
 *   `{ skipped: true, reason: 'too_large', size }` when over the limit,
 *   otherwise `{ skipped: false, size }` after the file has been written.
 * @throws {Error} `HTTP <status>` on a non-ok response; also rejects on
 *   network errors or when the 45 s timeout aborts the request.
 */
async function downloadToFile(url, destPath) {
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), 45000);
  try {
    const res = await fetch(url, { headers: { 'User-Agent': SEC_UA }, signal: ctrl.signal });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    // A missing header yields 0 (Number(null)) or NaN, neither of which trips the check.
    const declared = Number(res.headers?.get?.('content-length'));
    if (Number.isFinite(declared) && declared > MAX_FILE_BYTES) {
      ctrl.abort(); // stop the transfer instead of draining an unwanted body
      return { skipped: true, reason: 'too_large', size: declared };
    }
    const buf = Buffer.from(await res.arrayBuffer());
    if (buf.length > MAX_FILE_BYTES) return { skipped: true, reason: 'too_large', size: buf.length };
    ensureDir(path.dirname(destPath));
    fs.writeFileSync(destPath, buf);
    return { skipped: false, size: buf.length };
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Archive one filing under its local directory and describe what was saved.
 *
 * If a previous run left a meta.json that records a local primary document,
 * that result is reused without any network access, including the list of
 * earnings exhibits saved back then. Otherwise the function downloads,
 * best-effort and independently of each other:
 *   1. the primary document (unless it is the .txt submission itself),
 *   2. the full-submission .txt file,
 *   3. for earnings-related filings, exhibits listed in the filing index
 *      whose name/description looks like a press release,
 * and finally writes meta.json.
 *
 * @param {object} meta - Filing record; reads symbol, accession,
 *   primaryDocument, form and isEarningsRelated. All fields are copied to the result.
 * @param {number|string} cikNum - CIK used when building EDGAR URLs.
 * @returns {Promise<object>} `meta` extended with localPrimary, localTxt,
 *   excerpt, earningsExhibits, archived and reused (plus edgarUrl and
 *   archivedAt for a fresh archive). Local paths are relative to the
 *   directory above this module.
 */
async function archiveFiling(meta, cikNum) {
  const { symbol, accession, primaryDocument } = meta;
  const dir = filingDir(symbol, accession);
  const metaPath = path.join(dir, 'meta.json');
  const projectRoot = path.join(__dirname, '..');

  if (fs.existsSync(metaPath)) {
    try {
      const prev = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
      if (prev.localPrimary) {
        return {
          ...meta,
          localPrimary: prev.localPrimary,
          localTxt: prev.localTxt || null,
          excerpt: prev.excerpt || null,
          // Carry the saved exhibits along; without them a re-sync would
          // report "no exhibits" and the caller would store null over them.
          earningsExhibits: Array.isArray(prev.earningsExhibits) ? prev.earningsExhibits : [],
          archived: true,
          reused: true,
        };
      }
    } catch { /* unreadable or corrupt meta.json: fall through and re-download */ }
  }

  ensureDir(dir);
  const files = { localPrimary: null, localTxt: null, excerpt: null, archived: false, reused: false };

  // 1) Primary document. When it is the .txt submission, step 2 covers it.
  const primary = primaryDocument || `${accession}.txt`;
  if (primary && !primary.endsWith('.txt')) {
    const url = edgarPrimaryUrl(cikNum, accession, primary);
    const ext = path.extname(primary) || '.htm';
    const dest = path.join(dir, `primary${ext}`);
    try {
      const r = await downloadToFile(url, dest);
      if (!r.skipped) {
        files.localPrimary = path.relative(projectRoot, dest);
        const html = fs.readFileSync(dest, 'utf8');
        files.excerpt = excerptFromHtml(html);
        files.archived = true;
      }
    } catch { /* keep metadata only */ }
  }

  // 2) Full-submission text file.
  const txtUrl = edgarTxtUrl(cikNum, accession);
  const txtDest = path.join(dir, 'filing.txt');
  try {
    const r = await downloadToFile(txtUrl, txtDest);
    if (!r.skipped) {
      files.localTxt = path.relative(projectRoot, txtDest);
      files.archived = true;
      if (!files.excerpt) {
        const raw = fs.readFileSync(txtDest, 'utf8').slice(0, 80000);
        files.excerpt = excerptFromHtml(raw);
      }
    }
  } catch { /* the text copy is optional */ }

  // 3) Press-release style exhibits of earnings-related filings.
  const earningsExhibits = [];
  if (meta.isEarningsRelated) {
    try {
      const idx = await json(edgarIndexJsonUrl(cikNum, accession));
      const items = idx.directory?.item || [];
      for (const it of items) {
        const name = String(it.name || '');
        const desc = String(it.description || '');
        // NOTE(review): exhibit file names may also be spelled without the
        // dash (e.g. "ex991..."); confirm whether those should match too.
        if (!/ex-99|press release|earnings/i.test(name + desc)) continue;
        // The extension must be the suffix of the name. The previous
        // ungrouped pattern accepted ".htm" anywhere (e.g. "x.htmx.pdf").
        if (!/\.(?:html?|txt)$/i.test(name)) continue;
        const exUrl = edgarPrimaryUrl(cikNum, accession, name);
        const exDest = path.join(dir, name.replace(/[^\w.\-]+/g, '_'));
        try {
          const r = await downloadToFile(exUrl, exDest);
          if (!r.skipped) {
            earningsExhibits.push({
              name,
              description: desc,
              localPath: path.relative(projectRoot, exDest),
              url: exUrl,
            });
          }
        } catch { /* skip this exhibit */ }
      }
    } catch { /* no index available */ }
  }

  const fullMeta = {
    ...meta,
    ...files,
    earningsExhibits,
    edgarUrl: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${cikNum}&type=${encodeURIComponent(meta.form)}&dateb=&owner=include&count=40`,
    archivedAt: new Date().toISOString(),
  };
  fs.writeFileSync(metaPath, JSON.stringify(fullMeta, null, 2));
  return fullMeta;
}
/**
 * Fetch earnings-calendar events for one symbol and store them.
 *
 * The window spans from 400 days before today to 120 days after today
 * (UTC dates). Every returned event is upserted with kind
 * 'earnings_calendar'.
 *
 * @param {string} symbol - Ticker symbol to query and to store on each event.
 * @returns {Promise<number>} Number of events upserted.
 */
async function syncEarningsCalendar(symbol) {
  const now = new Date();
  // ISO date (YYYY-MM-DD) of "now" shifted by a number of UTC days.
  const isoDay = (offsetDays) => {
    const shifted = new Date(now);
    shifted.setUTCDate(shifted.getUTCDate() + offsetDays);
    return shifted.toISOString().slice(0, 10);
  };
  const startISO = isoDay(-400);
  const endISO = isoDay(120);
  const events = await fetchEarningsEvents(startISO, endISO, [symbol]);
  let saved = 0;
  for (const ev of events) {
    const record = {
      symbol,
      eventDate: ev.date,
      title: ev.title,
      titleZh: ev.title,
      timeLabel: ev.time || '',
      source: ev.source || 'Nasdaq earnings',
      url: ev.url,
      note: ev.note || '',
      kind: 'earnings_calendar',
    };
    upsertEarningsEvent(record);
    saved += 1;
  }
  return saved;
}
/**
 * Synchronise the local SEC archive for one ticker symbol.
 *
 * Resolves the symbol to a CIK, and unless a recent sync makes the run
 * unnecessary, downloads the submissions feed, archives each important
 * filing, upserts a filing row (and an earnings event for filings flagged as
 * earnings-related), refreshes the earnings calendar and stores sync
 * metadata.
 *
 * @param {string} symbol - Ticker symbol; trimmed and upper-cased before use.
 * @param {object} [options]
 * @param {boolean} [options.force=false] - Skip the "recently synced" shortcut.
 * @returns {Promise<object>} `{ symbol, skipped, filings, earnings, meta }`,
 *   plus `synced` (per-filing summary of this run) when the sync actually ran.
 * @throws {Error} 'bad_symbol' for an empty symbol, 'cik_not_found' when the
 *   ticker is unknown; errors from the ticker lookup and the submissions
 *   request also propagate.
 */
export async function syncSecArchive(symbol, { force = false } = {}) {
symbol = String(symbol || '').trim().toUpperCase();
if (!symbol) throw new Error('bad_symbol');
const hit = await tickerToCik(symbol);
if (!hit) throw new Error('cik_not_found');
const meta0 = getSecArchiveMeta(symbol);
// Soft cache window in hours: SEC_ARCHIVE_SOFT_HOURS, falling back to 12 when unset, non-numeric or 0.
const softMs = (Number(process.env.SEC_ARCHIVE_SOFT_HOURS) || 12) * 3600 * 1000;
// Synced recently enough: return the stored rows without touching the network again.
if (!force && meta0?.lastSyncAt && Date.now() - meta0.lastSyncAt < softMs) {
return {
symbol,
skipped: true,
filings: listSecFilings(symbol),
earnings: listEarningsEvents(symbol),
meta: meta0,
};
}
const sub = await json(`https://data.sec.gov/submissions/CIK${hit.cik}.json`);
// hit.cik is zero-padded; Number() yields the unpadded value used in Archives URLs.
const cikNum = Number(hit.cik);
// Best effort: derive an investor-relations URL from the company website reported by Yahoo.
let investorUrl = null;
try {
const y = await yahooQuoteSummary(symbol, 'assetProfile');
investorUrl = resolveInvestorRelationsUrl(y?.assetProfile?.website)?.url || null;
} catch { /* */ }
const candidates = collectFilingsFromSubmissions(sub, symbol);
const synced = [];
// Counts filings freshly archived in this run (reused archives are not counted).
let downloaded = 0;
for (const row of candidates) {
let archived = null;
try {
archived = await archiveFiling({ ...row, cik: hit.cik, companyName: hit.name }, cikNum);
if (archived.archived && !archived.reused) downloaded++;
} catch {
// Archiving failed: still record the filing's metadata below.
archived = { ...row, archived: false };
}
let earningsFlag = row.isEarningsRelated;
let excerpt = archived?.excerpt || null;
if (earningsFlag && archived?.localTxt) {
// Pre-flagged filing with a local text copy: let the text heuristics confirm or clear the flag.
try {
const txt = fs.readFileSync(path.join(__dirname, '..', archived.localTxt), 'utf8');
earningsFlag = isEarnings8k(txt, row.description);
if (earningsFlag && !excerpt) excerpt = excerptFromHtml(txt);
} catch { /* */ }
} else if (archived?.localTxt) {
// Not pre-flagged: the first 50000 characters of the text can still turn the flag on.
try {
const txt = fs.readFileSync(path.join(__dirname, '..', archived.localTxt), 'utf8').slice(0, 50000);
if (isEarnings8k(txt, row.description)) earningsFlag = true;
} catch { /* */ }
}
upsertSecFiling({
symbol,
accession: row.accession,
form: row.form,
formZh: row.formZh,
filedDate: row.filedDate,
reportDate: row.reportDate,
description: row.description,
primaryDocument: row.primaryDocument,
url: row.primaryDocument
? edgarPrimaryUrl(cikNum, row.accession, row.primaryDocument)
: edgarTxtUrl(cikNum, row.accession),
localPrimary: archived?.localPrimary || null,
localTxt: archived?.localTxt || null,
excerpt,
isEarningsRelated: earningsFlag ? 1 : 0,
earningsExhibits: archived?.earningsExhibits ? JSON.stringify(archived.earningsExhibits) : null,
});
// NOTE(review): the event below is titled and sourced as an 8-K even when the flag
// was set by the branch for forms that were not pre-flagged — confirm this is intended.
if (earningsFlag) {
upsertEarningsEvent({
symbol,
eventDate: row.reportDate || row.filedDate,
title: `${symbol} 財報/重大事件 8-K`,
titleZh: `${symbol} 財報公告8-K Item 2.02`,
timeLabel: '',
source: 'SEC 8-K',
// No remote URL is stored when a local primary copy exists.
url: archived?.localPrimary
? null
: (row.primaryDocument ? edgarPrimaryUrl(cikNum, row.accession, row.primaryDocument) : edgarTxtUrl(cikNum, row.accession)),
note: row.description || '已封存申報全文;法說逐字稿多由公司投資人關係頁發布',
kind: 'sec_8k',
accession: row.accession,
transcriptSearchUrl: investorUrl,
});
}
synced.push({
...row,
archived: !!archived?.archived,
localPrimary: archived?.localPrimary,
isEarningsRelated: earningsFlag,
});
}
// A failing calendar refresh must not fail the whole sync; it is reported as 0 events.
const earnN = await syncEarningsCalendar(symbol).catch(() => 0);
const meta = {
symbol,
companyName: hit.name,
cik: hit.cik,
lastSyncAt: Date.now(),
filingCount: listSecFilings(symbol).length,
earningsCount: listEarningsEvents(symbol).length,
downloadedThisRun: downloaded,
earningsCalendarSynced: earnN,
};
saveSecArchiveMeta(symbol, meta);
return {
symbol,
skipped: false,
filings: listSecFilings(symbol),
earnings: listEarningsEvents(symbol),
meta,
synced,
};
}
/**
 * Read everything stored for a symbol without triggering a sync.
 *
 * @param {string} symbol - Ticker symbol; trimmed and upper-cased before use.
 * @returns {{symbol: string, filings: *, earnings: *, meta: *}} Stored filings,
 *   earnings events and sync metadata as returned by the database helpers.
 */
export function getSecArchivePayload(symbol) {
  const ticker = String(symbol || '').trim().toUpperCase();
  const filings = listSecFilings(ticker);
  const earnings = listEarningsEvents(ticker);
  const meta = getSecArchiveMeta(ticker);
  return { symbol: ticker, filings, earnings, meta };
}
/**
 * Map (symbol, accession, file) to the path of an archived file on disk.
 *
 * All three inputs are treated as untrusted. The filing directory must
 * resolve to exactly ARCHIVE_ROOT/<symbol>/<accession>; anything that would
 * escape the archive root or address a different depth (for example a symbol
 * of "../..") is rejected. Only the base name of `file` is used.
 *
 * When the requested file does not exist, the filing's meta.json is consulted
 * and the recorded primary document, then the recorded text file, is returned
 * instead, provided it lies inside the same filing directory.
 *
 * @param {string} symbol - Ticker symbol; trimmed and upper-cased before use.
 * @param {string} accession - Accession number of the filing.
 * @param {string} [file] - Requested file name; defaults to 'primary.htm'.
 * @returns {string|null} Path of an existing archived file, or null.
 */
export function resolveArchiveFile(symbol, accession, file) {
  symbol = String(symbol || '').trim().toUpperCase();
  const dir = filingDir(symbol, accession);

  // Containment check against the fixed archive root. The earlier
  // `full.startsWith(dir)` test compared against a directory that was itself
  // built from caller input, so it could never reject a traversal.
  const relDir = path.relative(ARCHIVE_ROOT, dir);
  const parts = relDir.split(path.sep);
  const escapes = path.isAbsolute(relDir)
    || parts.length !== 2
    || parts.some((part) => part === '' || part === '.' || part === '..');
  if (escapes) return null;
  if (!fs.existsSync(dir)) return null;

  const safe = path.basename(String(file || 'primary.htm'));
  // "." / ".." survive basename() and would address a directory, not a file.
  if (safe === '.' || safe === '..') return null;
  const full = path.join(dir, safe);
  const insideDir = (candidate) => candidate.startsWith(dir + path.sep);
  if (!insideDir(full)) return null;
  if (fs.existsSync(full)) return full;

  // Fallback: whatever meta.json recorded as the archived document.
  const metaPath = path.join(dir, 'meta.json');
  if (fs.existsSync(metaPath)) {
    try {
      const m = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
      for (const recorded of [m.localPrimary, m.localTxt]) {
        if (!recorded) continue;
        const p = path.join(__dirname, '..', String(recorded));
        if (insideDir(p) && fs.existsSync(p)) return p;
      }
    } catch { /* unreadable meta.json: treat as not found */ }
  }
  return null;
}