// Company research: multi-source fetching (Taiwan / international news, company profile, 10-K supply-chain hints, management developments).
import { yahooQuoteSummary, yahooFinanceSearchNews } from './yahoo-session.js';
import {
cleanNewsPlain, cleanGoogleNewsTitle, parseGoogleRssDescription, normalizeNewsItem,
} from './news-text.js';
// Browser-like User-Agent that text() sends by default on every request.
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36';
// User-Agent passed explicitly on the sec.gov / data.sec.gov requests in this file.
// NOTE(review): the contact address looks like a placeholder — confirm a real one is used in deployment.
const SEC_UA = 'EmmyInvestDashboard/1.0 (personal learning tool; contact@example.com)';

/**
 * GET a URL and return the response body as a string.
 *
 * @param {string} url - Address to fetch.
 * @param {Object} [headers={}] - Extra request headers; they override the default User-Agent.
 * @param {number} [ms=14000] - Milliseconds before the request is aborted.
 * @returns {Promise<string>} The response body text.
 * @throws {Error} `HTTP <status>` when the response is not ok; fetch/abort errors propagate unchanged.
 */
async function text(url, headers = {}, ms = 14000) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), ms);
  try {
    const res = await fetch(url, {
      headers: { 'User-Agent': UA, ...headers },
      signal: controller.signal,
    });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    // Awaited inside the try so the timer is only cleared once the body has been read.
    return await res.text();
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Fetch a URL through text() and parse the body as JSON.
 *
 * @param {string} url - Address to fetch.
 * @param {Object} [headers={}] - Extra request headers; they override the default Accept header.
 * @param {number} [ms=14000] - Timeout in milliseconds, forwarded to text().
 * @returns {Promise<*>} The parsed JSON value.
 * @throws {SyntaxError} When the body is not valid JSON; errors from text() propagate.
 */
async function json(url, headers = {}, ms = 14000) {
  const body = await text(url, { Accept: 'application/json,text/plain,*/*', ...headers }, ms);
  return JSON.parse(body);
}

// Short alias for cleanNewsPlain (imported from ./news-text.js).
const strip = (s) => cleanNewsPlain(s);

/**
 * Return the trimmed inner content of the first `<name ...>...</name>` element in `block`.
 * Matching is case-insensitive and non-greedy; returns '' when the element is absent or empty.
 * `name` is interpolated into a RegExp unescaped, so pass plain tag names only.
 */
const tag = (block, name) => block.match(new RegExp(`<${name}[^>]*>([\\s\\S]*?)<\\/${name}>`, 'i'))?.[1]?.trim() || '';

/**
 * Convert a Google News RSS document into normalized news items.
 *
 * @param {string} xml - Raw RSS text; null/undefined is treated as empty.
 * @param {string} region - 'tw' selects the Taiwan source label; any other value gets the international label.
 * @param {number} [limit=12] - Maximum number of `<item>` elements to read.
 * @returns {Array<Object>} Results of normalizeNewsItem, keeping only entries with a truthy `titleZh` and `url`.
 */
function parseGoogleRss(xml, region, limit = 12) {
  const blocks = [...String(xml || '').matchAll(/<item>([\s\S]*?)<\/item>/gi)]
    .map((m) => m[1])
    .slice(0, limit);

  return blocks
    .map((block) => {
      const title = cleanGoogleNewsTitle(tag(block, 'title'));
      const link = tag(block, 'link') || (block.match(/<link[^>]*>([^<]+)<\/link>/i)?.[1] || '').trim();
      const pub = tag(block, 'pubDate');
      const { anchorText, fontPub } = parseGoogleRssDescription(tag(block, 'description'));
      const sourceName = cleanNewsPlain(tag(block, 'source'));
      const publisher = sourceName || fontPub || 'Google 新聞';

      // The description's anchor text is used only when it adds something beyond the
      // title and is not just a news.google.com link.
      let description = '';
      if (anchorText && anchorText !== title && anchorText.length > 6 && !/news\.google\.com/i.test(anchorText)) {
        description = anchorText;
      }

      // toISOString() throws a RangeError on an invalid Date; one malformed pubDate
      // must not discard the whole feed, so such items get created = null.
      const pubTime = pub ? new Date(pub) : null;
      const created = pubTime && !Number.isNaN(pubTime.getTime())
        ? pubTime.toISOString().slice(0, 10)
        : null;

      return normalizeNewsItem({
        title,
        titleZh: title,
        description: description.slice(0, 400),
        descriptionZh: description.slice(0, 400),
        url: link,
        publisher,
        created,
        region,
        source: region === 'tw' ? 'Google 新聞(台灣)' : 'Google 新聞(國際)',
      });
    })
    .filter((n) => n.titleZh && n.url);
}

/**
 * Collect up to 12 Taiwan-locale Google News items for a ticker.
 *
 * Queries run one after another and stop once 12 unique items (by URL) are collected.
 * A failed query is skipped so the remaining queries still run.
 *
 * @param {string} symbol - Ticker symbol used to build the search queries.
 * @param {?string} companyName - Added as an extra query only when it contains CJK characters.
 * @returns {Promise<Array<Object>>} At most 12 items as produced by parseGoogleRss.
 */
export async function fetchTaiwanNews(symbol, companyName) {
  const queries = [
    /NVDA/i.test(symbol) ? '輝達' : null,
    `${symbol} 台股`,
    `${symbol} 美股`,
    companyName && /[\u4e00-\u9fff]/.test(companyName) ? companyName : null,
  ].filter(Boolean);

  const seenUrls = new Set();
  const collected = [];
  for (const q of queries) {
    try {
      const url = `https://news.google.com/rss/search?q=${encodeURIComponent(q)}&hl=zh-TW&gl=TW&ceid=TW:zh-Hant`;
      const xml = await text(url, { Accept: 'application/rss+xml, application/xml, text/xml, */*' }, 10000);
      for (const item of parseGoogleRss(xml, 'tw', 15)) {
        if (seenUrls.has(item.url)) continue;
        seenUrls.add(item.url);
        collected.push(item);
      }
    } catch {
      // Best effort: ignore this query's failure and try the next one.
    }
    if (collected.length >= 12) break;
  }
  return collected.slice(0, 12);
}

// Map one Yahoo Finance news record onto the shared news-item shape.
function yahooNewsToItem(n) {
  const summary = strip(n.summary || '');
  return normalizeNewsItem({
    title: n.title,
    titleZh: n.title,
    description: summary,
    descriptionZh: summary,
    url: n.link,
    publisher: n.publisher || 'Yahoo Finance',
    // providerPublishTime is multiplied by 1000 before being handed to Date.
    created: n.providerPublishTime ? new Date(n.providerPublishTime * 1000).toISOString().slice(0, 10) : null,
    region: 'global',
    source: 'Yahoo Finance',
  });
}

/**
 * Collect up to 14 international news items for a ticker.
 *
 * Sources are tried in order: the Yahoo session helper, the public Yahoo search
 * endpoint, Google News (en-US) and the Nasdaq article API. Items are de-duplicated
 * by URL across all sources, and a failing source is skipped without aborting the rest.
 *
 * @param {string} symbol - Ticker symbol.
 * @returns {Promise<Array<Object>>} At most 14 normalized news items.
 */
export async function fetchGlobalNews(symbol) {
  const out = [];
  const seen = new Set();
  const addYahoo = (records) => {
    for (const n of records) {
      const item = yahooNewsToItem(n);
      if (item.url && !seen.has(item.url)) {
        seen.add(item.url);
        out.push(item);
      }
    }
  };

  try {
    addYahoo(await yahooFinanceSearchNews(symbol, 14));
  } catch {
    // Best effort: fall through to the next source.
  }

  try {
    // The paste had mangled "&quotesCount" into a stray double quote, leaving the
    // second query parameter glued onto the newsCount value.
    const y = await json(`https://query1.finance.yahoo.com/v1/finance/search?q=${encodeURIComponent(symbol)}&newsCount=12&quotesCount=0`);
    addYahoo(y.news || []);
  } catch {
    // Best effort: fall through to the next source.
  }

  for (const q of [`${symbol} stock`, `${symbol} earnings CEO`]) {
    try {
      const url = `https://news.google.com/rss/search?q=${encodeURIComponent(q)}&hl=en-US&gl=US&ceid=US:en`;
      const xml = await text(url, {}, 10000);
      for (const item of parseGoogleRss(xml, 'global', 10)) {
        if (seen.has(item.url)) continue;
        seen.add(item.url);
        out.push(item);
      }
    } catch {
      // Best effort: try the next query.
    }
    if (out.length >= 14) break;
  }

  try {
    const d = await json(`https://api.nasdaq.com/api/news/topic/articlebysymbol?q=${encodeURIComponent(symbol)}|stocks&offset=0&limit=8&fallback=true`, {
      Accept: 'application/json', Origin: 'https://www.nasdaq.com', Referer: 'https://www.nasdaq.com/',
    });
    for (const r of d?.data?.rows || []) {
      // Relative article paths are resolved against the Nasdaq site origin.
      const url = r.url ? (r.url.startsWith('http') ? r.url : `https://www.nasdaq.com${r.url}`) : null;
      if (!url || seen.has(url)) continue;
      seen.add(url);
      out.push(normalizeNewsItem({
        title: r.title,
        titleZh: r.title,
        description: strip(r.description || ''),
        descriptionZh: strip(r.description || ''),
        url,
        publisher: r.publisher || 'Nasdaq',
        created: r.created || r.ago,
        region: 'global',
        source: 'Nasdaq',
      }));
    }
  } catch {
    // Best effort: return whatever the earlier sources produced.
  }

  return out.slice(0, 14);
}

// Upper-cased ticker -> { cik, name }; filled on first use and reused afterwards.
let _tickerMap = null;
// In-flight download, shared so that parallel first calls fetch the file only once.
let _tickerMapLoading = null;

// Download SEC's ticker list and index it by upper-cased ticker.
async function loadTickerMap() {
  const d = await json('https://www.sec.gov/files/company_tickers.json', { 'User-Agent': SEC_UA });
  // Prototype-less object so lookups such as "CONSTRUCTOR" cannot hit Object.prototype.
  const map = Object.create(null);
  for (const entry of Object.values(d)) {
    map[String(entry.ticker).toUpperCase()] = {
      cik: String(entry.cik_str).padStart(10, '0'),
      name: entry.title,
    };
  }
  return map;
}

/**
 * Resolve a ticker symbol to its SEC CIK (zero-padded to 10 digits) and company title.
 *
 * @param {string} symbol - Ticker; surrounding whitespace and letter case are ignored.
 * @returns {Promise<?{cik: string, name: string}>} The entry, or null when the ticker is unknown.
 * @throws Errors from json() propagate when the ticker list cannot be loaded; the next call retries.
 */
async function tickerToCik(symbol) {
  if (!_tickerMap) {
    if (!_tickerMapLoading) {
      _tickerMapLoading = loadTickerMap().finally(() => { _tickerMapLoading = null; });
    }
    _tickerMap = await _tickerMapLoading;
  }
  // Keys are stored upper-cased, so normalize the lookup the same way.
  const key = String(symbol || '').trim().toUpperCase();
  return _tickerMap[key] || null;
}

/**
 * Build an extended company profile for a ticker.
 *
 * When `seed` already carries both `longBusinessSummary` and `sector`, it is reshaped
 * and returned without any network call. Otherwise the Yahoo quoteSummary modules
 * assetProfile / summaryProfile / peer are queried; if that call fails, a profile
 * with null fields and no peers is returned.
 *
 * @param {string} symbol - Ticker symbol; echoed back unchanged in the result.
 * @param {Object} [seed={}] - Previously fetched profile fields.
 * @returns {Promise<Object>} Profile with symbol, longBusinessSummary, website, sector,
 *   industry, country, employees, peers (and `source`, except on the failure fallback).
 */
export async function fetchCompanyProfileExtended(symbol, seed = {}) {
  if (seed.longBusinessSummary && seed.sector) {
    return {
      symbol,
      longBusinessSummary: seed.longBusinessSummary,
      website: seed.website || null,
      sector: seed.sector,
      industry: seed.industry || null,
      country: seed.country || null,
      employees: seed.fullTimeEmployees ?? null,
      peers: seed.peers || [],
      source: seed.source || 'Yahoo assetProfile',
    };
  }

  let profile = {
    symbol,
    longBusinessSummary: null,
    website: null,
    sector: null,
    industry: null,
    country: null,
    employees: null,
    peers: [],
  };
  try {
    const d = await yahooQuoteSummary(symbol, 'assetProfile,summaryProfile,peer');
    const p = d?.assetProfile || {};
    const sp = d?.summaryProfile || {};
    // Peers are upper-cased below, so the company's own ticker must be compared
    // in upper case too or a lower-case `symbol` would list itself as a peer.
    const self = String(symbol).toUpperCase();
    const peers = (d?.peer?.symbols || [])
      // Keep only the part after the last '.' of each peer identifier.
      .map((s) => String(s).split('.').pop()?.toUpperCase())
      .filter((s) => s && s !== self);
    profile = {
      symbol,
      longBusinessSummary: p.longBusinessSummary || sp.longBusinessSummary || null,
      website: p.website || sp.website || null,
      sector: p.sector || sp.sector || null,
      industry: p.industry || sp.industry || null,
      country: p.country || sp.country || null,
      employees: p.fullTimeEmployees ?? sp.fullTimeEmployees ?? null,
      peers: [...new Set(peers)].slice(0, 12),
      source: 'Yahoo quoteSummary',
    };
  } catch {
    // Best effort: keep the empty profile when Yahoo is unavailable.
  }
  return profile;
}

/**
 * Heuristically pull company-like names out of a passage of text.
 *
 * A name is a capitalized run ending in one of the suffixes Inc., Corp., Corporation,
 * Ltd., LLC or Co. Names are searched both inside sentences that mention
 * customers/clients/suppliers/competitors/partners and across the whole passage.
 *
 * @param {string} section - Text to scan; null/undefined is treated as empty.
 * @returns {string[]} Up to 15 unique names in order of first appearance.
 */
function extractNamedEntities(section) {
  const haystack = String(section || '');
  const names = new Set();
  const patterns = [
    /(?:customers?|clients?|suppliers?|competitors?|partners?)[^.]{0,400}/gi,
    /\b([A-Z][A-Za-z0-9&.\- ]{2,40}(?:Inc\.|Corp\.|Corporation|Ltd\.|LLC|Co\.))/g,
  ];
  for (const re of patterns) {
    for (const m of haystack.matchAll(re)) {
      // The first pattern has no capture group, so fall back to the whole match.
      const chunk = m[1] || m[0];
      const hits = chunk.match(/\b([A-Z][A-Za-z0-9&.\- ]{2,35}(?:Inc\.|Corp\.|Corporation|Ltd\.|LLC|Co\.))/g) || [];
      for (const h of hits) {
        const n = h.trim();
        if (n.length > 3 && n.length < 50) names.add(n);
      }
    }
  }
  return [...names].slice(0, 15);
}

/**
 * Collect supplier-like names from plain 10-K text.
 *
 * Passages are picked by supplier / supply-chain / "we rely on" / contract-manufacturer /
 * foundry wording. Each passage contributes the names found by extractNamedEntities
 * plus any hit from a fixed list of well-known suppliers.
 *
 * @param {string} plain - Filing text; null/undefined is treated as empty.
 * @returns {string[]} Up to 18 unique names in order of first appearance.
 */
function extract10kSuppliers(plain) {
  const body = String(plain || '');
  const names = new Set();
  const passages = [
    body.match(/(?:suppliers?|supply\s+chain|sole\s+supplier|third[- ]party\s+manufactur)[^.]{0,2000}/gi) || [],
    body.match(/(?:we\s+(?:rely|depend)\s+(?:on|upon)\s+)[^.]{0,800}/gi) || [],
    body.match(/(?:contract\s+manufactur|foundry)[^.]{0,1200}/gi) || [],
  ].flat();
  for (const passage of passages) {
    for (const n of extractNamedEntities(passage)) names.add(n);
    for (const m of passage.matchAll(/\b(TSMC|Taiwan Semiconductor|Samsung|SK\s*Hynix|Micron|ASML|Synopsys|Cadence|Foxconn|Hon\s*Hai)\b/gi)) {
      names.add(m[1].trim());
    }
  }
  return [...names].slice(0, 18);
}

/**
 * Collect customer-like names from plain 10-K text.
 *
 * Passages are picked by "major/principal customers", "customers include" or
 * "accounted for N%" wording. Each passage contributes the names found by
 * extractNamedEntities plus hits from two fixed lists of well-known companies.
 *
 * @param {string} plain - Filing text; null/undefined is treated as empty.
 * @returns {string[]} Up to 18 unique names in order of first appearance.
 */
function extract10kCustomers(plain) {
  const body = String(plain || '');
  const names = new Set();
  const passages = body.match(/(?:major\s+customers?|principal\s+customers?|customers?\s+include|accounted\s+for\s+\d+%)[^.]{0,2000}/gi) || [];
  for (const passage of passages) {
    for (const n of extractNamedEntities(passage)) names.add(n);
    for (const m of passage.matchAll(/\b(Microsoft|Amazon|Google|Alphabet|Meta|Apple|Tesla|Oracle)\b/gi)) {
      names.add(m[1].trim());
    }
    for (const m of passage.matchAll(/\b(Dell\s+Technologies|Hewlett[\s-]?Packard\s+Enterprise|Super\s*Micro\s+Computer|Lenovo|Cisco)\b/gi)) {
      names.add(m[1].trim());
    }
  }
  return [...names].slice(0, 18);
}

/**
 * Download a company's most recent 10-K and extract supply-chain hints from it.
 *
 * Steps: resolve the ticker to a CIK, read the SEC submissions index, take the first
 * entry of the recent filings whose form is exactly '10-K', fetch its primary
 * document and scan the first 180000 characters of its cleaned text.
 *
 * @param {string} symbol - Ticker symbol.
 * @returns {Promise<Object>} `{ excerpt, customers, suppliers, competitors }`, plus
 *   `source`, `filingUrl` and `companyName` when a filing was found. When the ticker
 *   or a 10-K cannot be found, excerpt is null and the lists are empty.
 * @throws Network and parse errors from the SEC requests propagate to the caller.
 */
export async function fetch10kChainHints(symbol) {
  const hit = await tickerToCik(symbol);
  if (!hit) return { excerpt: null, customers: [], suppliers: [], competitors: [] };

  const sub = await json(`https://data.sec.gov/submissions/CIK${hit.cik}.json`, { 'User-Agent': SEC_UA });
  const f = sub.filings?.recent || {};
  let accn = null;
  let primary = null;
  for (let i = 0; i < (f.form || []).length; i++) {
    if (f.form[i] === '10-K') {
      accn = f.accessionNumber[i];
      primary = f.primaryDocument?.[i];
      break;
    }
  }
  if (!accn || !primary) return { excerpt: null, customers: [], suppliers: [], competitors: [] };

  // The archive path uses the CIK without leading zeros and the accession number without dashes.
  const accNo = accn.replace(/-/g, '');
  const url = `https://www.sec.gov/Archives/edgar/data/${Number(hit.cik)}/${accNo}/${primary}`;
  const html = await text(url, { 'User-Agent': SEC_UA }, 28000);
  const plain = strip(html).slice(0, 180000);

  // First matching passage for each topic; '' when the filing has none.
  const custSec = plain.match(/(?:major customers?|principal customers?|customers? include)[^.]{0,1200}/i)?.[0] || '';
  const supSec = plain.match(/(?:suppliers?|supply chain|manufacturing)[^.]{0,1200}/i)?.[0] || '';
  const compSec = plain.match(/(?:competition|competitors?)[^.]{0,1200}/i)?.[0] || '';
  const bizSec = plain.match(/(?:business overview|description of business)[^.]{0,2500}/i)?.[0] || plain.slice(0, 2500);

  const customers = [...new Set([...extractNamedEntities(custSec), ...extract10kCustomers(plain)])];
  const suppliers = [...new Set([...extractNamedEntities(supSec), ...extract10kSuppliers(plain)])];
  return {
    excerpt: bizSec.slice(0, 2000),
    customers,
    suppliers,
    competitors: extractNamedEntities(compSec),
    source: 'SEC 10-K',
    filingUrl: url,
    companyName: hit.name,
  };
}

// Keywords (English and Chinese) that mark a headline as management / corporate-action news.
// The CEO/CFO/COO acronyms are word-bounded so they no longer match inside ordinary
// words such as "cooling" or "Cook"; the other keywords still match as substrings.
const MGMT_KW = /chief executive|\b(?:ceo|cfo|coo)s?\b|president|board|director|executive|resign|appoint|compensation|guidance|layoff|restructur|merger|acquisition|investigation|subpoena|執行長|財務長|董事|人事|裁員|併購|收購|指引|調查/i;

/**
 * Keep the news items whose title or description mentions a management keyword.
 *
 * @param {?Array<{title: ?string, description: ?string}>} news - Items to filter; null/undefined yields [].
 * @returns {Array<Object>} The first 10 matching items, in input order.
 */
export function filterManagementNews(news) {
  return (news || [])
    .filter((n) => MGMT_KW.test(`${n.title ?? ''} ${n.description ?? ''}`))
    .slice(0, 10);
}

/**
 * List a company's most recent 8-K filings from the SEC submissions index.
 *
 * @param {string} symbol - Ticker symbol.
 * @returns {Promise<Array<{form: string, filedDate: string, description: string, accession: string, url: string}>>}
 *   Up to 8 entries whose form starts with "8-K", in the order the index lists them;
 *   [] when the ticker is unknown. Every entry's `url` is the company's EDGAR 8-K
 *   listing page, not the individual filing.
 * @throws Network and parse errors from the SEC requests propagate to the caller.
 */
export async function fetchRecent8kHeadlines(symbol) {
  const hit = await tickerToCik(symbol);
  if (!hit) return [];

  const sub = await json(`https://data.sec.gov/submissions/CIK${hit.cik}.json`, { 'User-Agent': SEC_UA });
  const f = sub.filings?.recent || {};
  const out = [];
  for (let i = 0; i < (f.form || []).length && out.length < 8; i++) {
    if (!/^8-K/i.test(f.form[i])) continue;
    out.push({
      form: f.form[i],
      filedDate: f.filingDate[i],
      description: f.primaryDocDescription?.[i] || '',
      accession: f.accessionNumber[i],
      url: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${hit.cik}&type=8-K&dateb=&owner=include&count=40`,
    });
  }
  return out;
}

/**
 * Gather every research source for one ticker into a single bundle.
 *
 * Profile, 10-K hints and 8-K headlines are fetched in parallel first, because the
 * company name taken from them feeds the Taiwan news query. The two news fetches then
 * run in parallel. A source that fails is replaced by an empty value ({} or []),
 * so this function resolves even when individual sources are down.
 *
 * @param {string} symbol - Ticker; trimmed and upper-cased before use.
 * @param {?Object} [profile={}] - Already-known profile fields (`name` / `companyName`
 *   are read here; the whole object seeds the profile fetch). null is treated as {}.
 * @returns {Promise<Object>} `{ symbol, gatheredAt, profileExt, hints, headlines8k,
 *   newsTw, newsGlobal, managementNewsRaw, companyName }`.
 */
export async function gatherIntelSources(symbol, profile = {}) {
  const ticker = String(symbol || '').trim().toUpperCase();
  // Default parameters only cover undefined; an explicit null used to crash on profile.name.
  const seed = profile ?? {};

  const [profileExt, hints, headlines] = await Promise.all([
    fetchCompanyProfileExtended(ticker, seed).catch(() => ({})),
    fetch10kChainHints(ticker).catch(() => ({})),
    fetchRecent8kHeadlines(ticker).catch(() => []),
  ]);

  const companyName = seed.name || seed.companyName || hints?.companyName || null;

  const [newsTw, newsGlobal] = await Promise.all([
    fetchTaiwanNews(ticker, companyName).catch(() => []),
    fetchGlobalNews(ticker).catch(() => []),
  ]);

  const mgmtRaw = filterManagementNews([...newsTw, ...newsGlobal]);

  return {
    symbol: ticker,
    gatheredAt: new Date().toISOString(),
    profileExt,
    hints,
    headlines8k: headlines,
    newsTw,
    newsGlobal,
    managementNewsRaw: mgmtRaw,
    // Falls back to the ticker echoed by the profile when no company name is known.
    companyName: companyName || hints?.companyName || profileExt?.symbol,
  };
}