// Google RSS / news fields: HTML entity decoding and summary cleanup.

/**
 * Literal entity replacements applied, in this order, after numeric references
 * have been decoded.
 *
 * `&amp;` is handled before the entries that follow it, so a double-escaped
 * form such as `&amp;nbsp;` or `&amp;#39;` ends up fully decoded, while
 * `&amp;lt;` / `&amp;gt;` are only decoded one level.
 *
 * NOTE(review): the exact spelling of the apostrophe and non-breaking-space
 * keys was reconstructed — confirm against version control.
 */
const HTML_ENTITY_REPLACEMENTS = [
  ['&lt;', '<'],
  ['&gt;', '>'],
  ['&amp;', '&'],
  ['&quot;', '"'],
  ['&#39;', "'"],
  ['&apos;', "'"],
  ['&nbsp;', ' '],
  ['&#160;', ' '],
];

/**
 * Matches a trailing "Title - Publisher" style suffix.
 *
 * Dash separators (hyphen, en dash U+2013, em dash U+2014) only count when
 * surrounded by whitespace, so hyphenated words such as "US-China" are left
 * alone. Vertical bars (ASCII and full-width U+FF5C) count with or without
 * whitespace. The suffix itself is 1-48 characters and may not contain another
 * separator, so only the last segment is removed.
 */
const TITLE_SUFFIX_RE =
  /(?:\s+[-\u2013\u2014]\s+|\s*[|\uFF5C]\s*)(?:(?!\s[-\u2013\u2014]\s)[^|\uFF5C]){1,48}$/;

/**
 * Convert a numeric character reference value to its character.
 *
 * @param {number} codePoint - Parsed code point.
 * @returns {string} The character, or '' when the value is 0, NaN or above U+10FFFF.
 */
function fromNumericReference(codePoint) {
  return codePoint > 0 && codePoint < 0x110000 ? String.fromCodePoint(codePoint) : '';
}

/**
 * Decode hexadecimal and decimal numeric character references, then the small
 * set of literal entities listed in HTML_ENTITY_REPLACEMENTS.
 *
 * @param {*} s - Value to decode; null/undefined are treated as ''.
 * @returns {string} Decoded text.
 */
export function decodeHtmlEntities(s) {
  const input = String(s ?? '');
  if (!input) return '';

  let text = input
    .replace(/&#x([0-9a-f]+);/gi, (_, hex) => fromNumericReference(Number.parseInt(hex, 16)))
    .replace(/&#(\d+);/g, (_, dec) => fromNumericReference(Number(dec)));

  for (const [entity, char] of HTML_ENTITY_REPLACEMENTS) {
    text = text.split(entity).join(char);
  }
  return text;
}

/**
 * Decode entities, then strip tags and collapse whitespace.
 *
 * @param {*} s - Raw field value.
 * @returns {string} Plain text with single spaces and no leading/trailing whitespace.
 */
export function cleanNewsPlain(s) {
  return (
    decodeHtmlEntities(s)
      // NOTE(review): two tag-specific patterns preceded the generic tag
      // stripper but their content was lost; they are reconstructed here as
      // script/style block removal so embedded code never leaks into text.
      .replace(/<script\b[\s\S]*?<\/script\s*>/gi, ' ')
      .replace(/<style\b[\s\S]*?<\/style\s*>/gi, ' ')
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .trim()
  );
}

/**
 * Heuristic: does this value still look like markup or a bare link rather
 * than human-readable text?
 *
 * @param {*} s - Value to inspect; falsy values are treated as ''.
 * @returns {boolean} True when the text contains angle brackets (literal or
 *   still encoded), a leftover numeric reference, link attributes, starts with
 *   an http(s) URL, or is longer than 120 characters and mentions
 *   news.google.com.
 */
function looksLikeHtmlGarbage(s) {
  const text = String(s || '');
  if (/<|>|&(?:lt|gt);|&#|href\s*=|target\s*=\s*["']?_blank/i.test(text)) return true;
  if (/^https?:\/\//i.test(text)) return true;
  return text.length > 120 && /news\.google\.com/i.test(text);
}

/**
 * Extract the summary text and the publisher hint from a Google RSS
 * description.
 *
 * @param {*} raw - Description markup, entity-encoded or not.
 * @returns {{anchorText: string, fontPub: string}} Cleaned text of the first
 *   `<a>` element and of the first `<font>` element ('' when absent).
 */
export function parseGoogleRssDescription(raw) {
  const decoded = decodeHtmlEntities(raw);
  const anchorInner = decoded.match(/<a\b[^>]*>([\s\S]*?)<\/a>/i)?.[1] || '';
  const fontInner = decoded.match(/<font\b[^>]*>([\s\S]*?)<\/font>/i)?.[1] || '';
  return {
    anchorText: cleanNewsPlain(anchorInner),
    fontPub: cleanNewsPlain(fontInner),
  };
}

/**
 * Clean a title and drop a trailing "Title - Publisher" suffix.
 *
 * @param {*} raw - Raw title.
 * @returns {string} Cleaned title without the trailing publisher segment.
 */
export function cleanGoogleNewsTitle(raw) {
  return cleanNewsPlain(raw).replace(TITLE_SUFFIX_RE, '').trim();
}

/**
 * Pick a displayable publisher: the `publisher` field (parsed as a description
 * when it looks like markup), then `source`, then a generic fallback label.
 *
 * @param {object} item - News item being normalized.
 * @returns {string} Publisher name; never empty.
 */
function resolvePublisher(item) {
  const rawPublisher = item.publisher || '';
  let publisher = looksLikeHtmlGarbage(rawPublisher)
    ? parseGoogleRssDescription(rawPublisher).fontPub || ''
    : cleanNewsPlain(rawPublisher);

  if (!publisher || looksLikeHtmlGarbage(publisher)) {
    const fromSource = cleanNewsPlain(item.source || '');
    // The aggregator's own name is not a useful publisher label.
    const usable =
      fromSource && !looksLikeHtmlGarbage(fromSource) && !/Google\s*新聞/i.test(fromSource);
    publisher = usable ? fromSource : '';
  }

  return !publisher || looksLikeHtmlGarbage(publisher) ? '新聞' : publisher;
}

/**
 * Pick a displayable description, discarding values that are markup, that
 * merely repeat the title, or that are Google RSS article links.
 *
 * @param {object} item - News item being normalized.
 * @param {string} titleZh - Cleaned title.
 * @param {*} rawTitle - Title as it appeared on the item.
 * @returns {string} Description, or '' when nothing usable remains.
 */
function resolveDescription(item, titleZh, rawTitle) {
  const description = cleanNewsPlain(item.descriptionZh || item.description || '');
  if (!description || looksLikeHtmlGarbage(description)) return '';
  if (description === titleZh || description === rawTitle || titleZh.includes(description)) {
    return '';
  }
  if (/news\.google\.com\/rss\/articles/i.test(description)) return '';
  return description;
}

/**
 * Normalize one news item's title, description and publisher fields.
 *
 * @param {object} [item={}] - Raw item; null/undefined are treated as {}.
 * @returns {object} Copy of the item with `title`, `titleZh`, `description`,
 *   `descriptionZh` (both cut to 400 UTF-16 code units) and `publisher`
 *   overwritten; all other properties are carried over unchanged.
 */
export function normalizeNewsItem(item = {}) {
  const source = item ?? {};
  const rawTitle = source.titleZh || source.title || '';
  const titleZh = cleanGoogleNewsTitle(rawTitle) || cleanNewsPlain(rawTitle) || '(無標題)';

  // Only derive a separate title from `title` when it differs from what was
  // already used to build titleZh.
  const hasOwnTitle =
    Boolean(source.title) && (source.title !== rawTitle || source.title !== titleZh);
  const titleEn = hasOwnTitle ? cleanGoogleNewsTitle(source.title) : '';

  const summary = resolveDescription(source, titleZh, rawTitle).slice(0, 400);

  return {
    ...source,
    title: titleEn || titleZh,
    titleZh,
    description: summary,
    descriptionZh: summary,
    publisher: resolvePublisher(source),
  };
}

/**
 * Normalize every item of a news list.
 *
 * @param {Array<object>|null|undefined} list - Raw items.
 * @returns {Array<object>} Normalized items; [] when `list` is not an array.
 */
export function normalizeNewsList(list) {
  return Array.isArray(list) ? list.map((entry) => normalizeNewsItem(entry)) : [];
}