finance-dashboard/lib/news-text.js

99 lines
3.4 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Google RSS / 新聞欄位HTML 實體解碼與摘要清理
/**
 * Decode the HTML entities found in Google RSS / news fields.
 *
 * Three passes run one after another over the text: hexadecimal numeric
 * references (`&#x4e2d;`), decimal numeric references (`&#20013;`), then a
 * fixed table of literal entities applied in table order. A numeric reference
 * whose code point is 0 or greater than U+10FFFF is removed.
 *
 * @param {*} s - Value to decode; `null`/`undefined` count as empty, anything
 *   else is converted with `String()`.
 * @returns {string} The decoded text, or `''` for empty input.
 */
export function decodeHtmlEntities(s) {
  const input = String(s ?? '');
  if (input === '') {
    return '';
  }

  // Turn a numeric code point into its character; out-of-range values vanish.
  const fromCodePoint = (codePoint) => {
    const isInRange = codePoint > 0 && codePoint < 0x110000;
    return isInRange ? String.fromCodePoint(codePoint) : '';
  };

  const numericDecoded = input
    .replace(/&#x([0-9a-f]+);/gi, (_match, hexDigits) => fromCodePoint(parseInt(hexDigits, 16)))
    .replace(/&#(\d+);/g, (_match, decDigits) => fromCodePoint(Number(decDigits)));

  // Table order matters: entries listed after `&amp;` also see the text that
  // `&amp;` produced, which is the only way `&#39;` / `&#160;` can still be
  // present after the numeric passes above.
  const literalEntities = [
    ['&lt;', '<'],
    ['&gt;', '>'],
    ['&amp;', '&'],
    ['&quot;', '"'],
    ['&#39;', "'"],
    ['&apos;', "'"],
    ['&nbsp;', ' '],
    ['&#160;', ' '],
  ];

  return literalEntities.reduce(
    (text, [entity, replacement]) => text.split(entity).join(replacement),
    numericDecoded,
  );
}
/**
 * Reduce a raw news field to plain text.
 *
 * The value is entity-decoded first; then `<script>` and `<style>` elements
 * (including their content) and every remaining tag are each replaced by a
 * space, whitespace runs are collapsed to a single space, and the result is
 * trimmed.
 *
 * @param {*} s - Raw field value; handed to `decodeHtmlEntities`.
 * @returns {string} Single-spaced plain text.
 */
export function cleanNewsPlain(s) {
  const SPACE = ' ';
  // Applied in this order: script blocks, style blocks, any other tag, and
  // finally whitespace runs (which also merges the spaces inserted above).
  const patterns = [
    /<script[\s\S]*?<\/script>/gi,
    /<style[\s\S]*?<\/style>/gi,
    /<[^>]+>/g,
    /\s+/g,
  ];

  let text = decodeHtmlEntities(s);
  for (const pattern of patterns) {
    text = text.replace(pattern, SPACE);
  }
  return text.trim();
}
/**
 * Heuristic check for values that look like leftover markup or a bare URL
 * rather than readable text.
 *
 * Returns true when the text contains escaped-markup traces (`&lt;`, `&gt;`,
 * `&amp;#`, an `href=` or `target=_blank` attribute), when it starts with
 * `http://` or `https://`, or when it is longer than 120 characters and
 * mentions `news.google.com`.
 *
 * @param {*} s - Value to inspect; falsy values are treated as empty text.
 * @returns {boolean} Whether the value should be treated as garbage.
 */
function looksLikeHtmlGarbage(s) {
  const text = String(s || '');

  if (/&lt;|&gt;|&amp;#|href\s*=|target\s*=\s*["']?_blank/i.test(text)) {
    return true;
  }
  if (/^https?:\/\//i.test(text)) {
    return true;
  }
  // The Google News host alone is not enough; it only counts on long values.
  return text.length > 120 && /news\.google\.com/i.test(text);
}
/**
 * Extract the anchor text and the publisher hint from a Google RSS
 * `description` value.
 *
 * The value is entity-decoded, then the inner content of the first `<a>`
 * element and of the first `<font>` element are each reduced to plain text
 * with `cleanNewsPlain`.
 *
 * @param {*} raw - Raw description value.
 * @returns {{anchorText: string, fontPub: string}} Plain text of the first
 *   anchor and of the first font element; `''` for whichever is absent.
 */
export function parseGoogleRssDescription(raw) {
  const decoded = decodeHtmlEntities(raw);

  // The tag name must be followed by whitespace or `>`; otherwise elements
  // that merely start with the same letters (`<abbr>`, `<article>`, ...)
  // would be taken for the opening tag.
  const anchorMatch = decoded.match(/<a(?:\s[^>]*)?>([\s\S]*?)<\/a>/i);
  const fontMatch = decoded.match(/<font(?:\s[^>]*)?>([\s\S]*?)<\/font>/i);

  return {
    anchorText: cleanNewsPlain(anchorMatch?.[1] || ''),
    fontPub: cleanNewsPlain(fontMatch?.[1] || ''),
  };
}
/**
 * Clean a Google News headline: reduce it to plain text, then drop a trailing
 * "Title - Publisher" suffix whose publisher part is at most 48 characters.
 *
 * A plain hyphen only counts as the separator when it has whitespace on both
 * sides, so hyphenated words are left intact ("COVID-19", "US-China talks").
 * The en dash, em dash and `|` separators are accepted with or without
 * surrounding whitespace.
 *
 * @param {*} raw - Raw title value.
 * @returns {string} The cleaned title; may be `''`.
 */
export function cleanGoogleNewsTitle(raw) {
  const plain = cleanNewsPlain(raw);

  // \u2013 = en dash, \u2014 = em dash (written as escapes because they are
  // easily confused with "-"). The publisher part may contain hyphens of its
  // own ("Yahoo-News") but never another whitespace-delimited " - ", so only
  // the last separator of the title is consumed.
  const publisherSuffix = /(?:\s+-\s+|\s*[\u2013\u2014|]\s*)(?:(?!\s-\s)[^\u2013\u2014|]){1,48}$/;

  return plain.replace(publisherSuffix, '').trim();
}
/**
 * Cut `text` to at most `maxLength` UTF-16 units without leaving the first
 * half of a surrogate pair dangling at the end.
 */
function truncateOnCodePoint(text, maxLength) {
  if (text.length <= maxLength) {
    return text;
  }
  const cut = text.slice(0, maxLength);
  const lastUnit = cut.charCodeAt(cut.length - 1);
  const endsWithHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return endsWithHighSurrogate ? cut.slice(0, -1) : cut;
}

/**
 * Normalize one news record.
 *
 * All properties of `item` are copied through; these are overwritten:
 * - `titleZh`: cleaned `item.titleZh` (falling back to `item.title`), or the
 *   placeholder `'(無標題)'` when nothing usable remains.
 * - `title`: cleaned `item.title` when that yields text, otherwise `titleZh`.
 * - `publisher`: cleaned `item.publisher`; when that value looks like markup,
 *   the text of its first `<font>` element instead. Falls back to
 *   `item.source` (unless it looks like garbage or names "Google 新聞") and
 *   finally to `'新聞'`.
 * - `description` / `descriptionZh`: both set to the cleaned
 *   `item.descriptionZh || item.description`, blanked when it looks like
 *   garbage, repeats the title, or contains a `news.google.com/rss/articles`
 *   link, and limited to 400 UTF-16 units.
 *
 * @param {object|null} [item={}] - Raw news record; `null` is treated as `{}`.
 * @returns {object} A new object; `item` itself is not modified.
 */
export function normalizeNewsItem(item = {}) {
  // The default parameter only covers `undefined`; guard `null` too so a
  // stray null entry does not throw on property access.
  const source = item ?? {};

  const rawTitle = source.titleZh || source.title || '';
  const titleZh = cleanGoogleNewsTitle(rawTitle) || cleanNewsPlain(rawTitle) || '(無標題)';
  // When `source.title` is the very string `titleZh` was built from, cleaning
  // it again gives `titleZh` or '' — so `titleEn || titleZh` below needs no
  // further comparison against `rawTitle` / `titleZh`.
  const titleEn = source.title ? cleanGoogleNewsTitle(source.title) : '';

  const rawPublisher = source.publisher || '';
  let publisher = looksLikeHtmlGarbage(rawPublisher)
    ? parseGoogleRssDescription(rawPublisher).fontPub
    : cleanNewsPlain(rawPublisher);

  if (!publisher || looksLikeHtmlGarbage(publisher)) {
    const fromSource = cleanNewsPlain(source.source || '');
    const isUsable = fromSource
      && !looksLikeHtmlGarbage(fromSource)
      && !/Google\s*新聞/i.test(fromSource);
    publisher = isUsable ? fromSource : '';
  }
  // At this point `publisher` is either '' or already known not to be garbage.
  if (!publisher) {
    publisher = '新聞';
  }

  let description = cleanNewsPlain(source.descriptionZh || source.description || '');
  const repeatsTitle = description === titleZh
    || description === rawTitle
    || titleZh.includes(description);
  if (
    looksLikeHtmlGarbage(description)
    || repeatsTitle
    || /news\.google\.com\/rss\/articles/i.test(description)
  ) {
    description = '';
  }
  const shortDescription = truncateOnCodePoint(description, 400);

  return {
    ...source,
    title: titleEn || titleZh,
    titleZh,
    description: shortDescription,
    descriptionZh: shortDescription,
    publisher,
  };
}
/**
 * Normalize every entry of a news list with `normalizeNewsItem`.
 *
 * @param {Array<object>|null|undefined} list - Records to normalize; a falsy
 *   value yields an empty array.
 * @returns {Array<object>} A new array of normalized records.
 */
export function normalizeNewsList(list) {
  if (!list) {
    return [];
  }
  return list.map(normalizeNewsItem);
}