99 lines
3.4 KiB
JavaScript
99 lines
3.4 KiB
JavaScript
// Google RSS / 新聞欄位:HTML 實體解碼與摘要清理
|
||
|
||
/**
 * Named entities understood by decodeHtmlEntities, keyed by the text between
 * `&` and `;`.
 *
 * NOTE(review): in the reviewed copy the entity keys had been rendered as
 * their decoded characters, so this table is a reconstruction. In particular
 * `nbsp` is assumed to map to a plain space — confirm against the original.
 */
const NAMED_HTML_ENTITIES = new Map([
  ['lt', '<'],
  ['gt', '>'],
  ['amp', '&'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/**
 * Decode numeric (`&#NN;`, `&#xHH;`) and a small set of named HTML entities.
 *
 * Decoding is done in a single pass, so text produced by one replacement is
 * never decoded again (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param {*} s - Value to decode; `null`/`undefined` are treated as ''.
 *   Anything else is converted with `String()`.
 * @returns {string} The decoded text. Numeric references outside
 *   1..0x10FFFF are removed; unknown named entities are left untouched.
 */
export function decodeHtmlEntities(s) {
  const text = String(s ?? '');
  if (!text) return '';

  return text.replace(/&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi, (match, hex, dec, name) => {
    if (name !== undefined) {
      // Names are matched case-sensitively; anything not in the table is kept verbatim.
      return NAMED_HTML_ENTITIES.get(name) ?? match;
    }
    const codePoint = hex !== undefined ? Number.parseInt(hex, 16) : Number(dec);
    // String.fromCodePoint throws above 0x10FFFF, so out-of-range references are dropped.
    return codePoint > 0 && codePoint < 0x110000 ? String.fromCodePoint(codePoint) : '';
  });
}
|
||
|
||
/**
 * Reduce a possibly HTML-laden value to plain text: decode entities, blank
 * out `<script>`/`<style>` elements and every remaining tag, then collapse
 * whitespace runs to a single space and trim the ends.
 *
 * @param {*} s - Value to clean; it is passed to decodeHtmlEntities as-is.
 * @returns {string} The cleaned text.
 */
export function cleanNewsPlain(s) {
  // Order matters: whole script/style elements go before the generic tag
  // pattern, and whitespace is collapsed last.
  const blankedPatterns = [
    /<script[\s\S]*?<\/script>/gi,
    /<style[\s\S]*?<\/style>/gi,
    /<[^>]+>/g,
    /\s+/g,
  ];

  let text = decodeHtmlEntities(s);
  for (const pattern of blankedPatterns) {
    text = text.replace(pattern, ' ');
  }
  return text.trim();
}
|
||
|
||
/**
 * Heuristic: does this value look like leftover markup or a bare link rather
 * than human-readable text?
 *
 * It returns true when the text contains `<`, `>`, `&#`, an `href=` or
 * `target=_blank` attribute, when it starts with `http://` or `https://`,
 * or when it is longer than 120 characters and mentions `news.google.com`.
 *
 * @param {*} s - Value to inspect; falsy values are treated as ''.
 * @returns {boolean}
 */
function looksLikeHtmlGarbage(s) {
  const text = String(s || '');

  if (/<|>|&#|href\s*=|target\s*=\s*["']?_blank/i.test(text)) return true;
  if (/^https?:\/\//i.test(text)) return true;

  return text.length > 120 && /news\.google\.com/i.test(text);
}
|
||
|
||
/**
 * Pull the link text and the `<font>` text out of a Google RSS `description`
 * fragment.
 *
 * The input is entity-decoded first, then the inner content of the first
 * `<a>` element and of the first `<font>` element is extracted and reduced
 * to plain text with cleanNewsPlain.
 *
 * @param {*} raw - Raw description value (may still be entity-encoded).
 * @returns {{anchorText: string, fontPub: string}} Cleaned text of the first
 *   `<a>` and first `<font>`; '' for whichever element is absent.
 *   (`fontPub` is presumably the publisher name — verify against the feed.)
 */
export function parseGoogleRssDescription(raw) {
  const html = decodeHtmlEntities(raw);

  // The lookahead enforces a tag-name boundary so that `<abbr>`, `<address>`
  // and similar tags are not mistaken for `<a>` (likewise for `<font>`).
  const anchorInner = html.match(/<a(?=[\s/>])[^>]*>([\s\S]*?)<\/a>/i)?.[1] ?? '';
  const fontInner = html.match(/<font(?=[\s/>])[^>]*>([\s\S]*?)<\/font>/i)?.[1] ?? '';

  return {
    anchorText: cleanNewsPlain(anchorInner),
    fontPub: cleanNewsPlain(fontInner),
  };
}
|
||
|
||
/**
 * Clean a news title and strip a trailing "title - publisher" style suffix.
 *
 * The title is first reduced to plain text with cleanNewsPlain. A suffix of
 * 1–48 characters is then removed when it follows a separator:
 *   - an ASCII hyphen, only when surrounded by whitespace, so hyphenated
 *     words such as "COVID-19" are not truncated and already-cleaned titles
 *     stay unchanged;
 *   - an en dash, em dash or vertical bar, with optional whitespace.
 *
 * NOTE(review): the reviewed copy listed `|` twice in the separator class;
 * the second one is assumed to have been a fullwidth bar (U+FF5C) — confirm.
 *
 * @param {*} raw - Raw title value.
 * @returns {string} The cleaned title; may be '' if nothing remains.
 */
export function cleanGoogleNewsTitle(raw) {
  const plain = cleanNewsPlain(raw);
  const publisherSuffix = /(?:\s+-\s+|\s*[–—|\uFF5C]\s*)[^-–—|\uFF5C]{1,48}$/;
  return plain.replace(publisherSuffix, '').trim();
}
|
||
|
||
/**
 * Cut `text` to at most `maxUnits` UTF-16 code units without leaving a
 * dangling high surrogate at the end.
 */
function truncateOnCodePointBoundary(text, maxUnits) {
  if (text.length <= maxUnits) return text;
  const lastKept = text.charCodeAt(maxUnits - 1);
  const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
  return text.slice(0, endsOnHighSurrogate ? maxUnits - 1 : maxUnits);
}

/**
 * Normalise one news record: clean the titles, pick a usable publisher name
 * and sanitise the description.
 *
 * @param {object|null} [item={}] - Raw record. The fields read are `title`,
 *   `titleZh`, `publisher`, `source`, `description` and `descriptionZh`;
 *   `null` is treated like an empty record.
 * @returns {object} A copy of `item` with `title`, `titleZh`, `description`,
 *   `descriptionZh` and `publisher` replaced by their cleaned values. The
 *   description is limited to 400 UTF-16 code units.
 */
export function normalizeNewsItem(item = {}) {
  // The default parameter only covers `undefined`; guard `null` as well.
  const fields = item ?? {};

  const rawTitle = fields.titleZh || fields.title || '';
  const titleZh = cleanGoogleNewsTitle(rawTitle) || cleanNewsPlain(rawTitle) || '(無標題)';
  // `title` is cleaned separately whenever it differs from the raw title or
  // from the cleaned one; otherwise the cleaned titleZh is reused below.
  const hasOwnTitle = Boolean(fields.title)
    && (fields.title !== rawTitle || fields.title !== titleZh);
  const titleEn = hasOwnTitle ? cleanGoogleNewsTitle(fields.title) : '';

  // Publisher: prefer the `publisher` field. If it looks like markup, take
  // the text of its <font> element instead of the raw value.
  const rawPublisher = fields.publisher || '';
  let publisher = looksLikeHtmlGarbage(rawPublisher)
    ? parseGoogleRssDescription(rawPublisher).fontPub
    : cleanNewsPlain(rawPublisher);

  // Fall back to `source`, unless it is markup-like or names Google News itself.
  if (!publisher || looksLikeHtmlGarbage(publisher)) {
    const fromSource = cleanNewsPlain(fields.source || '');
    const isUsable = fromSource
      && !looksLikeHtmlGarbage(fromSource)
      && !/Google\s*新聞/i.test(fromSource);
    publisher = isUsable ? fromSource : '';
  }
  // At this point publisher is either a clean name or ''.
  if (!publisher) publisher = '新聞';

  // Description: drop it when it is markup-like, merely repeats the title,
  // or contains a Google News article link.
  let description = cleanNewsPlain(fields.descriptionZh || fields.description || '');
  const isUseless = looksLikeHtmlGarbage(description)
    || description === titleZh
    || description === rawTitle
    || titleZh.includes(description)
    || /news\.google\.com\/rss\/articles/i.test(description);
  if (isUseless) description = '';

  const summary = truncateOnCodePointBoundary(description, 400);

  return {
    ...fields,
    title: titleEn || titleZh,
    titleZh,
    description: summary,
    descriptionZh: summary,
    publisher,
  };
}
|
||
|
||
/**
 * Apply normalizeNewsItem to every entry of a list.
 *
 * @param {Array<object>|null|undefined} list - Raw records; any falsy value
 *   is treated as an empty list.
 * @returns {Array<object>} A new array of normalised records.
 */
export function normalizeNewsList(list) {
  if (!list) return [];
  return list.map(normalizeNewsItem);
}