// SEC key filings and earnings / earnings-call materials: fetched and written to the
// local archive/ directory + SQLite so the data stays available when links go dead.
|
||
import fs from 'node:fs';
|
||
import path from 'node:path';
|
||
import { fileURLToPath } from 'node:url';
|
||
import {
|
||
listSecFilings, upsertSecFiling, listEarningsEvents, upsertEarningsEvent,
|
||
getSecArchiveMeta, saveSecArchiveMeta,
|
||
} from './db.js';
|
||
import { fetchEarningsEvents } from './calendar.js';
|
||
import { resolveInvestorRelationsUrl } from './companyintel-links.js';
|
||
import { yahooQuoteSummary } from './yahoo-session.js';
|
||
|
||
// Directory containing this module (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Root folder for archived SEC documents: <module dir>/../archive/sec.
const ARCHIVE_ROOT = path.join(__dirname, '..', 'archive', 'sec');

// User-Agent header sent with every SEC request in this module.
// The built-in default carries a placeholder e-mail; set SEC_USER_AGENT to
// declare a real contact address without editing the source.
const SEC_UA = process.env.SEC_USER_AGENT?.trim()
  || 'EmmyInvestDashboard/1.0 (personal learning tool; contact@example.com)';

// Maximum number of filings collected per sync run.
const MAX_FILINGS_SYNC = 36;

// Downloads larger than this many bytes (12 MiB) are skipped, not stored.
const MAX_FILE_BYTES = 12 * 1024 * 1024;

// Maximum length, in characters, of a stored plain-text excerpt.
const EXCERPT_LEN = 4000;
|
||
|
||
// Form types considered worth archiving. Amendments ("/A" suffix) and any
// other 424B* variant are accepted by isImportantForm() on top of this list.
const IMPORTANT_FORMS = new Set([
  '10-K', '10-Q', '8-K', '6-K', '20-F', 'DEF 14A', 'DEFA14A', 'S-1', 'S-3', 'F-1', 'F-3',
  '424B1', '424B2', '424B3', '424B4', '424B5', 'SC 13D', 'SC 13G', '4', '3', '5',
]);
|
||
|
||
// Traditional-Chinese display labels keyed by base form type (no "/A" suffix).
// Frozen because it is a shared lookup table that is only ever read.
const FORM_ZH = Object.freeze({
  '10-K': '年報',
  '10-Q': '季報',
  '8-K': '重大事件(含財報公告)',
  '6-K': '外國公司重大事件',
  '20-F': '外國公司年報',
  'DEF 14A': '股東會說明書',
  'DEFA14A': '股東會補充',
  'S-1': '上市/增資說明',
  'S-3': '增資說明',
  'F-1': '外國公司上市',
  'F-3': '外國公司增資',
  '4': '內部人交易',
  '3': '內部人持股',
  '5': '內部人年度',
  'SC 13D': '主動持股申報',
  'SC 13G': '被動持股申報',
});
|
||
|
||
/**
 * Returns the Traditional-Chinese label for an SEC form type.
 *
 * A trailing "/A" (amendment) is ignored, so "10-K/A" shares the label of "10-K".
 *
 * @param {string} form - Form type as reported by EDGAR.
 * @returns {string} The label from FORM_ZH, or `form` unchanged when none is defined.
 */
function formLabelZh(form) {
  const full = String(form || '').trim();
  const base = full.replace(/\/A$/i, '');
  // Own-property checks only: a bare `FORM_ZH[key]` would resolve inherited
  // keys such as "constructor" to Object.prototype members.
  if (Object.hasOwn(FORM_ZH, base)) return FORM_ZH[base];
  if (Object.hasOwn(FORM_ZH, full)) return FORM_ZH[full];
  return form;
}
|
||
|
||
/**
 * Tells whether a form type should be archived.
 *
 * A form qualifies when it (or its base type with the "/A" amendment suffix
 * removed) is listed in IMPORTANT_FORMS, or when it starts with "424B".
 *
 * @param {string} form - Form type as reported by EDGAR.
 * @returns {boolean} True when the form is considered important.
 */
function isImportantForm(form) {
  const full = String(form || '').trim();
  if (!full) return false;
  const base = full.replace(/\/A$/i, '');
  return IMPORTANT_FORMS.has(base) || IMPORTANT_FORMS.has(full) || /^424B/i.test(full);
}
|
||
|
||
// Lazily loaded ticker -> { cik, name } lookup, shared by all calls.
let _tickerMap = null;

/**
 * Resolves a ticker symbol to its SEC CIK and company name.
 *
 * The ticker list is downloaded from sec.gov on first use and then cached in
 * memory for later calls.
 *
 * @param {string} symbol - Ticker symbol; matched case-insensitively.
 * @returns {Promise<{cik: string, name: string} | null>} CIK zero-padded to
 *   10 digits plus the company title, or null when the ticker is unknown.
 * @throws {Error} When the ticker list cannot be downloaded.
 */
async function tickerToCik(symbol) {
  if (!_tickerMap) {
    const res = await fetch('https://www.sec.gov/files/company_tickers.json', {
      headers: { 'User-Agent': SEC_UA },
      // Bound the request like the other SEC fetches in this module do.
      signal: AbortSignal.timeout(20000),
    });
    if (!res.ok) throw new Error(`SEC tickers HTTP ${res.status}`);
    const data = await res.json();

    // Null-prototype object so inherited keys can never look like tickers.
    const map = Object.create(null);
    for (const entry of Object.values(data || {})) {
      if (!entry?.ticker) continue;
      map[String(entry.ticker).toUpperCase()] = {
        cik: String(entry.cik_str).padStart(10, '0'),
        name: entry.title,
      };
    }
    // Publish only a fully built map; a failure above leaves the cache empty
    // so the next call retries instead of reusing a partial table.
    _tickerMap = map;
  }
  const key = String(symbol || '').trim().toUpperCase();
  return _tickerMap[key] || null;
}
|
||
|
||
/**
 * Fetches a URL with the SEC User-Agent and returns the response body as text.
 *
 * @param {string} url - Address to fetch.
 * @param {number} [ms=20000] - Abort the request after this many milliseconds.
 * @returns {Promise<string>} Response body.
 * @throws {Error} `HTTP <status>` on a non-2xx response; the fetch rejection
 *   when the request fails or is aborted.
 */
async function text(url, ms = 20000) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), ms);
  try {
    const res = await fetch(url, {
      headers: { 'User-Agent': SEC_UA },
      signal: controller.signal,
    });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    // Await inside the try so the timeout also covers reading the body.
    return await res.text();
  } finally {
    clearTimeout(timeoutId);
  }
}
|
||
|
||
/**
 * Fetches a URL with the SEC User-Agent and parses the response as JSON.
 *
 * @param {string} url - Address to fetch.
 * @param {number} [ms=15000] - Request timeout in milliseconds.
 * @returns {Promise<any>} Parsed JSON body.
 * @throws {Error} `HTTP <status>` on a non-2xx response; the fetch rejection
 *   when the request fails or times out.
 */
async function json(url, ms = 15000) {
  const res = await fetch(url, {
    headers: { 'User-Agent': SEC_UA, Accept: 'application/json' },
    signal: AbortSignal.timeout(ms),
  });
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  return res.json();
}
|
||
|
||
/**
 * Removes every dash from an accession number.
 *
 * @param {string} accn - Accession number, e.g. "0000320193-24-000001".
 * @returns {string} The value without dashes; empty string for a falsy input.
 */
function accNoDash(accn) {
  return String(accn || '').replace(/-/g, '');
}
|
||
|
||
/**
 * Builds the local archive directory for one filing:
 * ARCHIVE_ROOT/<symbol>/<accession without dashes>.
 *
 * The inputs are joined as-is, so callers handling untrusted values must
 * validate the result themselves.
 *
 * @param {string} symbol - Ticker symbol used as the first path segment.
 * @param {string} accn - Accession number, with or without dashes.
 * @returns {string} Directory path (not created by this function).
 */
function filingDir(symbol, accn) {
  return path.join(ARCHIVE_ROOT, symbol, accNoDash(accn));
}
|
||
|
||
/**
 * Builds the EDGAR Archives URL of a document inside a filing folder.
 *
 * @param {number|string} cikNum - Company CIK as used in the URL path.
 * @param {string} accn - Accession number, with or without dashes.
 * @param {string} primary - Document file name within the filing folder.
 * @returns {string} Absolute https URL.
 */
function edgarPrimaryUrl(cikNum, accn, primary) {
  return `https://www.sec.gov/Archives/edgar/data/${cikNum}/${accNoDash(accn)}/${primary}`;
}
|
||
|
||
/**
 * Builds the EDGAR Archives URL of a filing's `<accession>.txt` file.
 *
 * The file name uses `accn` exactly as given, while the folder segment uses
 * the dash-less form.
 *
 * @param {number|string} cikNum - Company CIK as used in the URL path.
 * @param {string} accn - Accession number, e.g. "0000320193-24-000001".
 * @returns {string} Absolute https URL.
 */
function edgarTxtUrl(cikNum, accn) {
  return `https://www.sec.gov/Archives/edgar/data/${cikNum}/${accNoDash(accn)}/${accn}.txt`;
}
|
||
|
||
/**
 * Builds the URL of the JSON directory listing for a filing folder.
 *
 * The listing is requested as `index.json` inside the folder rather than
 * `<accession>-index.json`: the consumer in this file reads `directory.item`
 * from the response, which is the shape of a directory listing.
 * NOTE(review): `index.json` is the name given in the SEC "Accessing EDGAR
 * Data" page as recalled — confirm against a live filing folder.
 *
 * @param {number|string} cikNum - Company CIK as used in the URL path.
 * @param {string} accn - Accession number, with or without dashes.
 * @returns {string} Absolute https URL.
 */
function edgarIndexJsonUrl(cikNum, accn) {
  return `https://www.sec.gov/Archives/edgar/data/${cikNum}/${accNoDash(accn)}/index.json`;
}
|
||
|
||
/**
 * Creates a directory and any missing parents; does nothing if it already exists.
 *
 * @param {string} dir - Directory path to create.
 * @returns {void}
 */
function ensureDir(dir) {
  fs.mkdirSync(dir, { recursive: true });
}
|
||
|
||
/**
 * Writes `content` to `filePath` only when nothing exists at that path yet.
 *
 * Uses an exclusive-create write ("wx") so the existence check and the write
 * are a single operation, instead of a separate existsSync() followed by a
 * write that another writer could race.
 *
 * @param {string} filePath - Destination path.
 * @param {string|Buffer|Uint8Array} content - Data to write.
 * @returns {boolean} True when the file was written, false when it already existed.
 * @throws {Error} Any write failure other than "already exists".
 */
function writeIfAbsent(filePath, content) {
  try {
    fs.writeFileSync(filePath, content, { flag: 'wx' });
    return true;
  } catch (err) {
    if (err?.code === 'EEXIST') return false;
    throw err;
  }
}
|
||
|
||
/**
 * Reduces HTML (or SGML-ish text) to a short plain-text excerpt.
 *
 * Script and style elements are dropped with their content, remaining tags
 * become spaces, whitespace is collapsed, and the result is truncated.
 *
 * @param {string} html - Markup to summarise; falsy values give ''.
 * @param {number} [maxLen=EXCERPT_LEN] - Maximum excerpt length in characters.
 * @returns {string} Plain-text excerpt of at most `maxLen` characters.
 */
function excerptFromHtml(html, maxLen = EXCERPT_LEN) {
  const plain = String(html || '')
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    // Numeric entities (decimal and hex) and &nbsp; are blanked out so they do
    // not leak into the excerpt. Entities such as &lt; / &gt; are left encoded
    // on purpose: decoding them could re-create markup that was just stripped.
    .replace(/&(?:#\d+|#x[0-9a-f]+|nbsp);/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return plain.slice(0, maxLen);
}
|
||
|
||
/**
 * Heuristically decides whether a filing is an earnings announcement.
 *
 * Only the first 120000 characters of "description + newline + text" are
 * scanned, for the 8-K item number 2.02, its item title, or generic earnings
 * wording (including the Chinese term in the last pattern).
 *
 * @param {string} txt - Filing text.
 * @param {string} description - Filing description from the filing metadata.
 * @returns {boolean} True when any earnings marker is found.
 */
function isEarnings8k(txt, description) {
  const blob = `${description || ''}\n${txt || ''}`.slice(0, 120000);
  const markers = [
    /Item\s+2\.02/i,
    /Results of Operations and Financial Condition/i,
    /財報|earnings release|quarterly results/i,
  ];
  return markers.some((re) => re.test(blob));
}
|
||
|
||
/**
 * Picks the important filings out of an EDGAR "submissions" payload.
 *
 * `sub.filings.recent` is read as a set of parallel arrays (form,
 * accessionNumber, filingDate, ...) indexed by filing. Entries are taken in
 * payload order until MAX_FILINGS_SYNC rows have been collected.
 *
 * @param {object} sub - Parsed submissions JSON; missing parts give [].
 * @param {string} symbol - Ticker symbol copied onto every row.
 * @returns {Array<object>} Rows with symbol, accession, form, formZh,
 *   filedDate, reportDate, primaryDocument, description and an initial
 *   isEarningsRelated guess (true for 8-K and 8-K/A).
 */
function collectFilingsFromSubmissions(sub, symbol) {
  const recent = sub?.filings?.recent || {};
  const forms = recent.form || [];
  const out = [];

  for (let i = 0; i < forms.length && out.length < MAX_FILINGS_SYNC; i++) {
    const form = forms[i];
    if (!isImportantForm(form)) continue;

    // Every column is read defensively: a payload missing one of the parallel
    // arrays should drop or blank the field, not throw.
    const accession = recent.accessionNumber?.[i];
    if (!accession) continue;

    out.push({
      symbol,
      accession,
      form,
      formZh: formLabelZh(form),
      filedDate: recent.filingDate?.[i] || null,
      reportDate: recent.reportDate?.[i] || null,
      primaryDocument: recent.primaryDocument?.[i] || null,
      description: recent.primaryDocDescription?.[i] || recent.description?.[i] || '',
      isEarningsRelated: String(form).trim().replace(/\/A$/i, '') === '8-K',
    });
  }
  return out;
}
|
||
|
||
/**
 * Downloads a URL to a local file, refusing bodies larger than MAX_FILE_BYTES.
 *
 * The size cap is enforced while downloading: an oversized Content-Length
 * rejects the response up front, and the body is otherwise read chunk by
 * chunk with a running total, so an oversized file is never buffered whole.
 * Nothing is written to disk for a skipped download.
 *
 * @param {string} url - Address to download.
 * @param {string} destPath - Destination file; parent directories are created.
 * @returns {Promise<{skipped: boolean, size: number, reason?: string}>}
 *   `{ skipped: false, size }` after writing, or
 *   `{ skipped: true, reason: 'too_large', size }` where `size` is the declared
 *   length or the number of bytes read when the cap was hit.
 * @throws {Error} `HTTP <status>` on a non-2xx response; fetch/abort/file errors.
 */
async function downloadToFile(url, destPath) {
  const ctrl = new AbortController();
  // One 45 s budget covers both the response headers and the body.
  const timer = setTimeout(() => ctrl.abort(), 45000);
  try {
    const res = await fetch(url, { headers: { 'User-Agent': SEC_UA }, signal: ctrl.signal });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);

    const declared = Number(res.headers?.get?.('content-length'));
    if (Number.isFinite(declared) && declared > MAX_FILE_BYTES) {
      ctrl.abort(); // drop the connection instead of draining an unwanted body
      return { skipped: true, reason: 'too_large', size: declared };
    }

    const chunks = [];
    let total = 0;
    if (res.body) {
      for await (const chunk of res.body) {
        total += chunk.byteLength;
        if (total > MAX_FILE_BYTES) {
          // Leaving the loop cancels the stream, which stops the download.
          return { skipped: true, reason: 'too_large', size: total };
        }
        chunks.push(chunk);
      }
    }

    const buf = Buffer.concat(chunks, total);
    ensureDir(path.dirname(destPath));
    fs.writeFileSync(destPath, buf);
    return { skipped: false, size: buf.length };
  } finally {
    clearTimeout(timer);
  }
}
|
||
|
||
/**
 * Archives one filing under its local folder and records what was stored.
 *
 * Steps, each best-effort (a failed download leaves that part empty):
 *  1. Reuse an earlier archive when meta.json exists and names a primary file.
 *  2. Download the primary document (unless it is a .txt) as primary<ext> and
 *     build an excerpt from it.
 *  3. Download the `<accession>.txt` file as filing.txt; it supplies the
 *     excerpt when step 2 produced none.
 *  4. For filings flagged isEarningsRelated, read the folder's JSON index and
 *     download exhibits that look like press releases.
 *  5. Write meta.json describing the result.
 *
 * Stored paths are relative to the parent directory of this module's folder.
 *
 * @param {object} meta - Filing row; symbol, accession, primaryDocument, form
 *   and isEarningsRelated are read, and all fields are copied to the result.
 * @param {number|string} cikNum - Company CIK used in EDGAR URLs.
 * @returns {Promise<object>} `meta` plus localPrimary, localTxt, excerpt,
 *   earningsExhibits, archived and reused; freshly archived filings also get
 *   edgarUrl and archivedAt.
 */
async function archiveFiling(meta, cikNum) {
  const { symbol, accession, primaryDocument } = meta;
  const dir = filingDir(symbol, accession);
  const metaPath = path.join(dir, 'meta.json');
  const projectRoot = path.join(__dirname, '..');
  const toRelative = (absPath) => path.relative(projectRoot, absPath);

  if (fs.existsSync(metaPath)) {
    try {
      const prev = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
      if (prev.localPrimary) {
        return {
          ...meta,
          localPrimary: prev.localPrimary,
          localTxt: prev.localTxt || null,
          excerpt: prev.excerpt || null,
          // Carry the stored exhibit list along; without it the caller would
          // persist "no exhibits" over the ones recorded by the first sync.
          earningsExhibits: Array.isArray(prev.earningsExhibits) ? prev.earningsExhibits : [],
          archived: true,
          reused: true,
        };
      }
    } catch { /* unreadable or corrupt meta.json: fall through and re-download */ }
  }

  ensureDir(dir);
  const files = { localPrimary: null, localTxt: null, excerpt: null, archived: false, reused: false };
  const primary = primaryDocument || `${accession}.txt`;

  // A .txt primary is the same document as the filing.txt download below.
  if (primary && !primary.endsWith('.txt')) {
    const url = edgarPrimaryUrl(cikNum, accession, primary);
    const ext = path.extname(primary) || '.htm';
    const dest = path.join(dir, `primary${ext}`);
    try {
      const result = await downloadToFile(url, dest);
      if (!result.skipped) {
        files.localPrimary = toRelative(dest);
        files.excerpt = excerptFromHtml(fs.readFileSync(dest, 'utf8'));
        files.archived = true;
      }
    } catch { /* best effort: keep metadata only */ }
  }

  const txtDest = path.join(dir, 'filing.txt');
  try {
    const result = await downloadToFile(edgarTxtUrl(cikNum, accession), txtDest);
    if (!result.skipped) {
      files.localTxt = toRelative(txtDest);
      files.archived = true;
      if (!files.excerpt) {
        const raw = fs.readFileSync(txtDest, 'utf8').slice(0, 80000);
        files.excerpt = excerptFromHtml(raw);
      }
    }
  } catch { /* best effort: the text copy is optional */ }

  const earningsExhibits = [];
  if (meta.isEarningsRelated) {
    // Exhibit 99 names appear both with and without a separator ("ex-99",
    // "ex99", "exhibit99"), so the separator is optional.
    const looksLikeRelease = /ex(?:hibit)?[-_ ]?99|press release|earnings/i;
    // Anchored as a whole: the extension must be the end of the name. The
    // previous pattern anchored only ".txt" and accepted ".htm" anywhere.
    const allowedExt = /\.(?:html?|txt)$/i;
    try {
      const idx = await json(edgarIndexJsonUrl(cikNum, accession));
      const items = idx.directory?.item || [];
      for (const it of items) {
        const name = String(it.name || '');
        const desc = String(it.description || '');
        if (!looksLikeRelease.test(name + desc)) continue;
        if (!allowedExt.test(name)) continue;

        const exUrl = edgarPrimaryUrl(cikNum, accession, name);
        // Keep the local file name to word characters, dots and dashes.
        const exDest = path.join(dir, name.replace(/[^\w.\-]+/g, '_'));
        try {
          const result = await downloadToFile(exUrl, exDest);
          if (!result.skipped) {
            earningsExhibits.push({
              name,
              description: desc,
              localPath: toRelative(exDest),
              url: exUrl,
            });
          }
        } catch { /* skip this exhibit, keep the others */ }
      }
    } catch { /* no index available: archive without exhibits */ }
  }

  const fullMeta = {
    ...meta,
    ...files,
    earningsExhibits,
    edgarUrl: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${cikNum}&type=${encodeURIComponent(meta.form)}&dateb=&owner=include&count=40`,
    archivedAt: new Date().toISOString(),
  };
  fs.writeFileSync(metaPath, JSON.stringify(fullMeta, null, 2));
  return fullMeta;
}
|
||
|
||
/**
 * Stores earnings-calendar events for a symbol.
 *
 * Events are requested for the window from 400 days before today to 120 days
 * after today (UTC dates) and each one is upserted with kind
 * 'earnings_calendar'.
 *
 * @param {string} symbol - Ticker symbol.
 * @returns {Promise<number>} Number of events upserted.
 */
async function syncEarningsCalendar(symbol) {
  const now = new Date();
  const isoDay = (offsetDays) => {
    const d = new Date(now);
    d.setUTCDate(d.getUTCDate() + offsetDays);
    return d.toISOString().slice(0, 10);
  };

  const fetchedEvents = await fetchEarningsEvents(isoDay(-400), isoDay(120), [symbol]);
  // Treat a missing or non-array result as "no events" rather than throwing.
  const events = Array.isArray(fetchedEvents) ? fetchedEvents : [];

  for (const ev of events) {
    upsertEarningsEvent({
      symbol,
      eventDate: ev.date,
      title: ev.title,
      titleZh: ev.title, // no separate translation is available; reuse the title
      timeLabel: ev.time || '',
      source: ev.source || 'Nasdaq earnings',
      url: ev.url,
      note: ev.note || '',
      kind: 'earnings_calendar',
    });
  }
  return events.length;
}
|
||
|
||
/**
 * Decides whether an archived filing is earnings-related and picks its excerpt.
 *
 * Starts from the form-based guess on `row` and, when a local text copy
 * exists, refines it with isEarnings8k(): a flagged filing must be confirmed
 * by its full text, an unflagged one is promoted when its first 50000
 * characters match. An unreadable local copy leaves the guess unchanged.
 *
 * @param {object} row - Filing row (isEarningsRelated, description).
 * @param {object} archived - Archive result (excerpt, localTxt).
 * @returns {{earningsFlag: boolean, excerpt: string|null}}
 */
function classifyEarnings(row, archived) {
  let earningsFlag = row.isEarningsRelated;
  let excerpt = archived?.excerpt || null;
  if (!archived?.localTxt) return { earningsFlag, excerpt };

  try {
    const txt = fs.readFileSync(path.join(__dirname, '..', archived.localTxt), 'utf8');
    if (earningsFlag) {
      earningsFlag = isEarnings8k(txt, row.description);
      // Only the head of the text is needed for an excerpt, so cap the input
      // the same way archiveFiling does for filing.txt.
      if (earningsFlag && !excerpt) excerpt = excerptFromHtml(txt.slice(0, 80000));
    } else if (isEarnings8k(txt.slice(0, 50000), row.description)) {
      earningsFlag = true;
    }
  } catch { /* local copy unreadable: keep the form-based guess */ }
  return { earningsFlag, excerpt };
}

/**
 * Synchronises the local SEC archive for one ticker.
 *
 * Unless `force` is set, a sync that ran within the soft interval
 * (SEC_ARCHIVE_SOFT_HOURS, default 12 hours) is skipped and stored data is
 * returned. Otherwise recent filings are listed from the SEC submissions
 * endpoint, archived one by one, upserted into the database together with an
 * earnings event for each earnings-related filing, the earnings calendar is
 * refreshed, and the sync metadata is saved.
 *
 * @param {string} symbol - Ticker symbol; trimmed and upper-cased.
 * @param {{force?: boolean}} [options] - `force` bypasses the soft interval.
 * @returns {Promise<object>} `{ symbol, skipped, filings, earnings, meta }`,
 *   plus `synced` (rows handled in this run) when not skipped.
 * @throws {Error} 'bad_symbol' for an empty symbol, 'cik_not_found' for an
 *   unknown ticker, or the error from the submissions request.
 */
export async function syncSecArchive(symbol, { force = false } = {}) {
  const sym = String(symbol || '').trim().toUpperCase();
  if (!sym) throw new Error('bad_symbol');
  const hit = await tickerToCik(sym);
  if (!hit) throw new Error('cik_not_found');

  const meta0 = getSecArchiveMeta(sym);
  const softMs = (Number(process.env.SEC_ARCHIVE_SOFT_HOURS) || 12) * 3600 * 1000;
  if (!force && meta0?.lastSyncAt && Date.now() - meta0.lastSyncAt < softMs) {
    return {
      symbol: sym,
      skipped: true,
      filings: listSecFilings(sym),
      earnings: listEarningsEvents(sym),
      meta: meta0,
    };
  }

  const sub = await json(`https://data.sec.gov/submissions/CIK${hit.cik}.json`);
  const cikNum = Number(hit.cik); // URL paths use the CIK without zero padding

  let investorUrl = null;
  try {
    const y = await yahooQuoteSummary(sym, 'assetProfile');
    investorUrl = resolveInvestorRelationsUrl(y?.assetProfile?.website)?.url || null;
  } catch { /* the investor-relations link is optional */ }

  const candidates = collectFilingsFromSubmissions(sub, sym);
  const synced = [];
  let downloaded = 0;

  for (const row of candidates) {
    let archived;
    try {
      archived = await archiveFiling({ ...row, cik: hit.cik, companyName: hit.name }, cikNum);
      if (archived.archived && !archived.reused) downloaded++;
    } catch {
      // Keep going: the filing is still recorded, just without local files.
      archived = { ...row, archived: false };
    }

    const { earningsFlag, excerpt } = classifyEarnings(row, archived);
    const remoteUrl = row.primaryDocument
      ? edgarPrimaryUrl(cikNum, row.accession, row.primaryDocument)
      : edgarTxtUrl(cikNum, row.accession);

    upsertSecFiling({
      symbol: sym,
      accession: row.accession,
      form: row.form,
      formZh: row.formZh,
      filedDate: row.filedDate,
      reportDate: row.reportDate,
      description: row.description,
      primaryDocument: row.primaryDocument,
      url: remoteUrl,
      localPrimary: archived?.localPrimary || null,
      localTxt: archived?.localTxt || null,
      excerpt,
      isEarningsRelated: earningsFlag ? 1 : 0,
      earningsExhibits: archived?.earningsExhibits ? JSON.stringify(archived.earningsExhibits) : null,
    });

    if (earningsFlag) {
      // NOTE(review): title/source/kind always say "8-K", yet non-8-K filings
      // can be flagged by the text check above — confirm whether the labels
      // should follow row.form.
      upsertEarningsEvent({
        symbol: sym,
        eventDate: row.reportDate || row.filedDate,
        title: `${sym} 財報/重大事件 8-K`,
        titleZh: `${sym} 財報公告(8-K Item 2.02)`,
        timeLabel: '',
        source: 'SEC 8-K',
        // No remote URL is stored when a local primary copy exists.
        url: archived?.localPrimary ? null : remoteUrl,
        note: row.description || '已封存申報全文;法說逐字稿多由公司投資人關係頁發布',
        kind: 'sec_8k',
        accession: row.accession,
        transcriptSearchUrl: investorUrl,
      });
    }

    synced.push({
      ...row,
      archived: !!archived?.archived,
      localPrimary: archived?.localPrimary,
      isEarningsRelated: earningsFlag,
    });
  }

  // A calendar failure must not fail the whole sync; count it as zero events.
  const earnN = await syncEarningsCalendar(sym).catch(() => 0);

  // Read the stored lists once and reuse them for the counts and the result.
  const filings = listSecFilings(sym);
  const earnings = listEarningsEvents(sym);

  const meta = {
    symbol: sym,
    companyName: hit.name,
    cik: hit.cik,
    lastSyncAt: Date.now(),
    filingCount: filings.length,
    earningsCount: earnings.length,
    downloadedThisRun: downloaded,
    earningsCalendarSynced: earnN,
  };
  saveSecArchiveMeta(sym, meta);

  return {
    symbol: sym,
    skipped: false,
    filings,
    earnings,
    meta,
    synced,
  };
}
|
||
|
||
/**
 * Returns what is currently stored for a ticker, without any network access.
 *
 * @param {string} symbol - Ticker symbol; trimmed and upper-cased.
 * @returns {{symbol: string, filings: any, earnings: any, meta: any}} Stored
 *   filings, earnings events and sync metadata as returned by the db helpers.
 */
export function getSecArchivePayload(symbol) {
  const sym = String(symbol || '').trim().toUpperCase();
  return {
    symbol: sym,
    filings: listSecFilings(sym),
    earnings: listEarningsEvents(sym),
    meta: getSecArchiveMeta(sym),
  };
}
|
||
|
||
/**
 * Resolves a request for an archived file to a path on disk.
 *
 * `symbol`, `accession` and `file` are treated as untrusted input. The filing
 * folder must sit exactly two levels below ARCHIVE_ROOT (<symbol>/<accession>),
 * so values such as ".." cannot point the lookup outside the archive, and
 * `file` is reduced to its base name. When the requested file is missing, the
 * primary document or text copy recorded in meta.json is returned instead,
 * provided it also lies inside ARCHIVE_ROOT.
 *
 * @param {string} symbol - Ticker symbol; trimmed and upper-cased.
 * @param {string} accession - Accession number, with or without dashes.
 * @param {string} [file='primary.htm'] - Requested file name.
 * @returns {string|null} Path of an existing regular file, or null.
 */
export function resolveArchiveFile(symbol, accession, file) {
  const sym = String(symbol || '').trim().toUpperCase();
  const root = path.resolve(ARCHIVE_ROOT);
  const dir = path.resolve(filingDir(sym, accession));

  const isInside = (parent, child) => {
    const rel = path.relative(parent, child);
    return rel !== '' && rel !== '..' && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel);
  };
  const isFile = (p) => fs.statSync(p, { throwIfNoEntry: false })?.isFile() === true;

  // Exactly <root>/<symbol>/<accession>: rejects traversal and empty segments.
  if (!isInside(root, dir)) return null;
  if (path.relative(root, dir).split(path.sep).length !== 2) return null;
  if (!fs.statSync(dir, { throwIfNoEntry: false })?.isDirectory()) return null;

  const safe = path.basename(String(file || 'primary.htm'));
  if (!safe || safe === '.' || safe === '..') return null;

  const full = path.join(dir, safe);
  if (isFile(full)) return full;

  // Requested file is missing: fall back to what meta.json recorded.
  const metaPath = path.join(dir, 'meta.json');
  if (!isFile(metaPath)) return null;
  try {
    const recorded = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
    for (const relPath of [recorded.localPrimary, recorded.localTxt]) {
      if (!relPath) continue;
      const candidate = path.resolve(__dirname, '..', String(relPath));
      if (isInside(root, candidate) && isFile(candidate)) return candidate;
    }
  } catch { /* unreadable or corrupt meta.json: nothing to fall back to */ }
  return null;
}