finance-dashboard/lib/news-text.js

99 lines
3.4 KiB
JavaScript
Raw Normal View History

2026-06-04 09:32:28 +00:00
// Google RSS / 新聞欄位HTML 實體解碼與摘要清理
/**
 * Decode the HTML entities found in RSS / news text fields.
 *
 * Three sequential passes run over the text: hexadecimal numeric references
 * (`&#x27;`), decimal numeric references (`&#39;`), then a fixed list of
 * named entities replaced one after another in list order. Because the passes
 * are sequential, text produced by an earlier step can be matched by a later
 * one (e.g. `&amp;quot;` ends up as `"`, while `&amp;lt;` ends up as `&lt;`).
 *
 * @param {*} s - Value to decode; `null`/`undefined` are treated as ''.
 * @returns {string} Decoded text. Numeric references whose code point is not
 *   in 1..0x10FFFF are removed.
 */
export function decodeHtmlEntities(s) {
  const input = String(s ?? '');
  if (input === '') return '';

  // Out-of-range code points decode to nothing instead of throwing.
  const toChar = (codePoint) =>
    (codePoint > 0 && codePoint < 0x110000 ? String.fromCodePoint(codePoint) : '');

  const numericDecoded = input
    .replace(/&#x([0-9a-f]+);/gi, (_match, hexDigits) => toChar(parseInt(hexDigits, 16)))
    .replace(/&#(\d+);/g, (_match, digits) => toChar(Number(digits)));

  // Order matters: each entry is applied to the output of the previous one.
  const namedEntities = [
    ['&lt;', '<'],
    ['&gt;', '>'],
    ['&amp;', '&'],
    ['&quot;', '"'],
    ['&#39;', "'"],
    ['&apos;', "'"],
    ['&nbsp;', ' '],
    ['&#160;', ' '],
  ];

  return namedEntities.reduce(
    (text, [entity, char]) => text.split(entity).join(char),
    numericDecoded,
  );
}
/**
 * Turn an HTML-ish fragment into plain text: decode entities, blank out
 * `<script>` / `<style>` blocks and any remaining tags, collapse whitespace
 * runs to a single space, and trim the ends.
 *
 * @param {*} s - Raw field value (passed through `decodeHtmlEntities`).
 * @returns {string} Plain, single-spaced text.
 */
export function cleanNewsPlain(s) {
  // Applied in order; every match is replaced by a single space.
  const blankOutPatterns = [
    /<script[\s\S]*?<\/script>/gi,
    /<style[\s\S]*?<\/style>/gi,
    /<[^>]+>/g,
    /\s+/g,
  ];

  let text = decodeHtmlEntities(s);
  for (const pattern of blankOutPatterns) {
    text = text.replace(pattern, ' ');
  }
  return text.trim();
}
/**
 * Heuristic: does this value look like leftover markup or a bare link rather
 * than readable text?
 *
 * @param {*} s - Candidate text; falsy values are treated as ''.
 * @returns {boolean} True when the text still contains encoded markup or link
 *   attributes, starts with an http(s) URL, or is longer than 120 characters
 *   and mentions news.google.com.
 */
function looksLikeHtmlGarbage(s) {
  const text = String(s || '');

  // Still-encoded entities or anchor attributes.
  if (/&lt;|&gt;|&amp;#|href\s*=|target\s*=\s*["']?_blank/i.test(text)) {
    return true;
  }
  // The value begins with a URL.
  if (/^https?:\/\//i.test(text)) {
    return true;
  }
  // Long blob that references Google News.
  return text.length > 120 && /news\.google\.com/i.test(text);
}
/**
 * Extract the summary text and the publisher hint from a Google RSS
 * `description` value.
 *
 * @param {*} raw - Raw description markup (entity-encoded or not).
 * @returns {{anchorText: string, fontPub: string}} Cleaned inner text of the
 *   first `<a>` element and of the first `<font>` element; '' when absent.
 */
export function parseGoogleRssDescription(raw) {
  const markup = decodeHtmlEntities(raw);

  // Cleaned contents of the first capture group, or '' when nothing matches.
  const firstInnerText = (pattern) => {
    const found = markup.match(pattern);
    return cleanNewsPlain(found?.[1] || '');
  };

  const anchorText = firstInnerText(/<a[^>]*>([\s\S]*?)<\/a>/i);
  const fontPub = firstInnerText(/<font[^>]*>([\s\S]*?)<\/font>/i);
  return { anchorText, fontPub };
}
/**
 * Clean a headline and drop its trailing "Title - Publisher" suffix.
 *
 * The suffix is the last separator followed by at most 48 characters that
 * contain no further separator. A dash (`-`, `–`, `—`) only counts as a
 * separator when it has whitespace on both sides, so hyphenated words such as
 * "COVID-19" or "mid-cycle" are left intact; a pipe (`|`) separates with or
 * without surrounding whitespace.
 *
 * @param {*} raw - Raw headline (may contain entities or tags).
 * @returns {string} Headline without the publisher suffix; may be '' when the
 *   whole text was a suffix.
 */
export function cleanGoogleNewsTitle(raw) {
  const plain = cleanNewsPlain(raw);
  return plain
    .replace(/(?:\s+[-–—]\s+|\s*\|\s*)[^-–—|]{1,48}$/, '')
    .trim();
}
/**
 * Normalise one raw news record by cleaning its title, publisher and
 * description fields.
 *
 * - `titleZh`: cleaned `item.titleZh || item.title`; falls back to the
 *   tag-stripped raw title, then to '(無標題)'.
 * - `title`: cleaned `item.title` when it differs from the raw or the cleaned
 *   title and is non-empty after cleaning, otherwise `titleZh`.
 * - `publisher`: taken from `item.publisher` (via the `<font>` element when
 *   the field looks like markup), else from `item.source` unless that is
 *   garbage or a "Google 新聞" label, else '新聞'.
 * - `description` / `descriptionZh`: cleaned description, emptied when it
 *   looks like markup, repeats the title, or contains a Google News article
 *   link; capped at 400 UTF-16 units.
 *
 * @param {object|null} [item] - Raw record; `null`/`undefined` are treated as
 *   an empty record.
 * @returns {object} Shallow copy of `item` with the fields above overwritten.
 */
export function normalizeNewsItem(item = {}) {
  // The default parameter only covers `undefined`; also accept `null`.
  const record = item ?? {};

  const rawTitle = record.titleZh || record.title || '';
  const titleZh = cleanGoogleNewsTitle(rawTitle) || cleanNewsPlain(rawTitle) || '(無標題)';

  // Only re-clean `title` when it is not already what titleZh was built from
  // or differs from the cleaned result.
  const hasDistinctTitle = Boolean(record.title)
    && (record.title !== rawTitle || record.title !== titleZh);
  const titleEn = hasDistinctTitle ? cleanGoogleNewsTitle(record.title) : '';

  const rawPublisher = record.publisher || '';
  let publisher = looksLikeHtmlGarbage(rawPublisher)
    ? parseGoogleRssDescription(rawPublisher).fontPub || ''
    : cleanNewsPlain(rawPublisher);

  if (!publisher || looksLikeHtmlGarbage(publisher)) {
    const fromSource = cleanNewsPlain(record.source || '');
    const isUsableSource = fromSource
      && !looksLikeHtmlGarbage(fromSource)
      && !/Google\s*新聞/i.test(fromSource);
    publisher = isUsableSource ? fromSource : '';
  }
  if (!publisher || looksLikeHtmlGarbage(publisher)) {
    publisher = '新聞';
  }

  let description = cleanNewsPlain(record.descriptionZh || record.description || '');
  if (looksLikeHtmlGarbage(description)) {
    description = '';
  }
  // A description that merely repeats (part of) the title adds nothing.
  const repeatsTitle = description !== ''
    && (description === titleZh || description === rawTitle || titleZh.includes(description));
  if (repeatsTitle) {
    description = '';
  }
  if (description && /news\.google\.com\/rss\/articles/i.test(description)) {
    description = '';
  }

  // Cap at 400 UTF-16 units; back off one unit when the cut would otherwise
  // split a surrogate pair and leave a lone high surrogate at the end.
  let limit = 400;
  if (description.length > limit) {
    const lastUnit = description.charCodeAt(limit - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      limit -= 1;
    }
  }
  const summary = description.slice(0, limit);

  return {
    ...record,
    title: titleEn || titleZh,
    titleZh,
    description: summary,
    descriptionZh: summary,
    publisher,
  };
}
/**
 * Normalise every record of a news list with `normalizeNewsItem`.
 *
 * @param {Array<object>|null|undefined} list - Raw records; a falsy value
 *   yields an empty array.
 * @returns {Array<object>} New array of normalised records.
 */
export function normalizeNewsList(list) {
  if (!list) {
    return [];
  }
  return list.map(normalizeNewsItem);
}