// finance-dashboard/lib/news-text.js

// Google RSS / 新聞欄位：HTML 實體解碼與摘要清理

/**
 * Decode the HTML character references found in Google RSS / news fields.
 *
 * Supported references:
 *   - hexadecimal numeric (`&#x4e2d;`, `&#X41;`)
 *   - decimal numeric (`&#20013;`)
 *   - named: `&lt;` `&gt;` `&amp;` `&quot;` `&apos;` `&nbsp;` (case-sensitive)
 *
 * All references are decoded in ONE left-to-right pass, so each one is decoded
 * exactly once. Replacing entity kinds in separate sequential passes makes the
 * result depend on pass order (`&amp;quot;` would collapse to `"` while
 * `&amp;lt;` would stop at `&lt;`); a single pass yields `&quot;` and `&lt;`
 * respectively. Callers that need to unwrap doubly-escaped markup can call
 * this function twice.
 *
 * Numeric references that do not name a usable code point (0, values above
 * U+10FFFF, and UTF-16 surrogates U+D800–U+DFFF) are removed rather than
 * emitted, so the result never contains a lone surrogate.
 *
 * @param {*} s - Value to decode. `null`/`undefined` are treated as an empty
 *   string; anything else is coerced with `String()`.
 * @returns {string} The decoded text ('' for empty input).
 */
export function decodeHtmlEntities(s) {
  const text = String(s ?? '');
  if (!text) return '';

  // `&nbsp;` is mapped to a plain space; numeric `&#160;` keeps U+00A0.
  const named = { lt: '<', gt: '>', amp: '&', quot: '"', apos: "'", nbsp: ' ' };

  const isUsableCodePoint = (cp) =>
    cp > 0 && cp < 0x110000 && !(cp >= 0xd800 && cp <= 0xdfff);

  return text.replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(lt|gt|amp|quot|apos|nbsp));/g,
    (_, hex, dec, name) => {
      if (name !== undefined) return named[name];
      const cp = hex !== undefined ? Number.parseInt(hex, 16) : Number(dec);
      return isUsableCodePoint(cp) ? String.fromCodePoint(cp) : '';
    },
  );
}

/**
 * Reduce an HTML-ish news field to plain text.
 *
 * Entities are decoded first, then `<script>…</script>` and `<style>…</style>`
 * blocks and any remaining tags are each replaced by a single space, and
 * finally runs of whitespace are collapsed and the ends trimmed.
 *
 * @param {*} s - Raw field value (passed through `decodeHtmlEntities`).
 * @returns {string} Plain, single-spaced text.
 */
export function cleanNewsPlain(s) {
  // Order matters: whole script/style blocks must go before the generic tag rule.
  const markupPatterns = [
    /<script[\s\S]*?<\/script>/gi,
    /<style[\s\S]*?<\/style>/gi,
    /<[^>]+>/g,
  ];

  let text = decodeHtmlEntities(s);
  for (const pattern of markupPatterns) {
    text = text.replace(pattern, ' ');
  }

  const collapsed = text.replace(/\s+/g, ' ');
  return collapsed.trim();
}

/**
 * Heuristic: does this value look like leftover markup or a link rather than
 * human-readable text?
 *
 * True when the text contains escaped markup (`&lt;`, `&gt;`, `&amp;#`), an
 * `href=` or `target=_blank` attribute, starts with an http(s) URL, or is
 * longer than 120 characters and mentions `news.google.com`.
 *
 * @param {*} s - Value to inspect; falsy values are treated as ''.
 * @returns {boolean}
 */
function looksLikeHtmlGarbage(s) {
  const text = String(s || '');

  if (/&lt;|&gt;|&amp;#|href\s*=|target\s*=\s*["']?_blank/i.test(text)) {
    return true;
  }
  if (/^https?:\/\//i.test(text)) {
    return true;
  }
  // A short mention of the Google News host is tolerated; a long blob is not.
  if (text.length <= 120) {
    return false;
  }
  return /news\.google\.com/i.test(text);
}

/**
 * Extract the summary text and the publisher hint from a Google RSS
 * `description` value.
 *
 * The input is entity-decoded, then the inner content of the first `<a>`
 * element and of the first `<font>` element are each reduced to plain text
 * with `cleanNewsPlain`.
 *
 * The tag names are matched with a word boundary (`<a\b`, `<font\b`) so that
 * other elements whose names merely start with "a" — `<abbr>`, `<article>`,
 * `<aside>`, `<audio>` … — are not mistaken for the anchor.
 *
 * @param {*} raw - Raw description markup (may be entity-escaped).
 * @returns {{ anchorText: string, fontPub: string }} Plain text of the first
 *   anchor and of the first `<font>` element; '' when the element is absent.
 */
export function parseGoogleRssDescription(raw) {
  const decoded = decodeHtmlEntities(raw);
  const anchorInner = decoded.match(/<a\b[^>]*>([\s\S]*?)<\/a>/i)?.[1] ?? '';
  const fontInner = decoded.match(/<font\b[^>]*>([\s\S]*?)<\/font>/i)?.[1] ?? '';
  return {
    anchorText: cleanNewsPlain(anchorInner),
    fontPub: cleanNewsPlain(fontInner),
  };
}

/**
 * Clean a news title and drop a trailing "Title - Publisher" suffix.
 *
 * The title is first reduced to plain text with `cleanNewsPlain`. A trailing
 * segment of 1–48 characters is then removed when it is introduced by:
 *   - a dash (`-`, `–`, `—`) with whitespace on BOTH sides, or
 *   - a pipe (`|`, `｜`) with or without surrounding whitespace.
 *
 * Dashes must be whitespace-delimited so that hyphenated words and numeric
 * ranges inside the title ("AI-powered", "COVID-19", "5.25-5.5%") are not
 * mistaken for the publisher separator and cut off.
 *
 * @param {*} raw - Raw title value.
 * @returns {string} The cleaned title; may be '' if nothing is left.
 */
export function cleanGoogleNewsTitle(raw) {
  const plain = cleanNewsPlain(raw);
  return plain
    .replace(/(?:\s+[-–—]\s+|\s*[|｜]\s*)[^-–—|｜]{1,48}$/, '')
    .trim();
}

/**
 * Normalize one news record: clean its titles, publisher and description.
 *
 * - `titleZh`: cleaned `item.titleZh || item.title`, falling back to the
 *   plain-text title and finally to the placeholder '（無標題）' ("untitled").
 * - `title`: cleaned `item.title` when present and non-empty after cleaning,
 *   otherwise `titleZh`.
 * - `publisher`: taken from `item.publisher` (parsed out of a `<font>` element
 *   when the value looks like markup), then from `item.source`, then the
 *   generic label '新聞' ("News").
 * - `description` / `descriptionZh`: cleaned text capped at 400 UTF-16 units,
 *   emptied when it looks like markup, repeats the title, or is a Google News
 *   article link.
 *
 * All other properties of `item` are copied through unchanged.
 *
 * @param {object|null} [item={}] - Raw news record; `null`/`undefined` are
 *   treated as an empty record.
 * @returns {object} A new object with the normalized fields.
 */
export function normalizeNewsItem(item = {}) {
  // The default parameter only covers `undefined`; guard `null` too so one bad
  // entry cannot throw and abort normalization of an entire list.
  const src = item ?? {};

  const rawTitle = src.titleZh || src.title || '';
  const titleZh = cleanGoogleNewsTitle(rawTitle) || cleanNewsPlain(rawTitle) || '（無標題）';
  // Only ever used as `titleEn || titleZh` below.
  const titleEn = src.title ? cleanGoogleNewsTitle(src.title) : '';

  const rawPublisher = src.publisher || '';
  let publisher = '';
  if (looksLikeHtmlGarbage(rawPublisher)) {
    // Publisher field holds description markup: pull the <font> text out of it.
    const { fontPub } = parseGoogleRssDescription(rawPublisher);
    publisher = fontPub || '';
  } else {
    publisher = cleanNewsPlain(rawPublisher);
  }
  if (!publisher || looksLikeHtmlGarbage(publisher)) {
    // Fall back to `source`, unless it is junk or just the aggregator's own name.
    const fromSource = cleanNewsPlain(src.source || '');
    publisher = fromSource && !looksLikeHtmlGarbage(fromSource) && !/Google\s*新聞/i.test(fromSource)
      ? fromSource
      : '';
  }
  if (!publisher || looksLikeHtmlGarbage(publisher)) publisher = '新聞';

  let description = cleanNewsPlain(src.descriptionZh || src.description || '');
  if (looksLikeHtmlGarbage(description)) description = '';
  // A description that merely repeats (part of) the title adds nothing.
  if (description && (description === titleZh || description === rawTitle || titleZh.includes(description))) {
    description = '';
  }
  if (description && /news\.google\.com\/rss\/articles/i.test(description)) description = '';

  const summary = description.slice(0, 400);

  return {
    ...src,
    title: titleEn || titleZh,
    titleZh,
    description: summary,
    descriptionZh: summary,
    publisher,
  };
}

/**
 * Normalize every record in a news list with `normalizeNewsItem`.
 *
 * @param {Array<object>|null|undefined} list - Raw records; a falsy value is
 *   treated as an empty list.
 * @returns {Array<object>} A new array of normalized records.
 */
export function normalizeNewsList(list) {
  const entries = list || [];
  return entries.map((entry) => normalizeNewsItem(entry));
}