thread-master/backend/internal/library/placement/crawler_exec.go

166 lines
4.4 KiB
Go
Raw Permalink Normal View History

2026-06-26 08:37:04 +00:00
package placement
import (
"bytes"
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// CrawlerSearchFn is the function signature for a crawler-backed keyword
// search: it runs a Playwright keyword search with a logged-in browser session
// and returns the posts it found, or an error.
//
// NOTE(review): member presumably identifies whose browser session is used and
// limit presumably caps the number of posts requested; neither contract is
// visible in this file, so confirm against the implementations.
type CrawlerSearchFn func(ctx context.Context, member MemberContext, keyword string, limit int) ([]DiscoverPost, error)
// execCrawlerInput is the request document that RunExecCrawlerSearch encodes as
// JSON and writes to the keyword-search CLI's stdin.
type execCrawlerInput struct {
// StorageState is the caller-supplied session value, forwarded after
// whitespace trimming.
// NOTE(review): whether this is a file path or inline JSON is not visible
// here; confirm against the CLI.
StorageState string `json:"storage_state"`
// Query is the whitespace-trimmed search keyword.
Query string `json:"query"`
// Limit is the requested post count; RunExecCrawlerSearch substitutes 12 when
// the caller passes a non-positive value.
Limit int `json:"limit"`
}
// execCrawlerPost is one element of the "posts" array decoded from the CLI's
// stdout. Its JSON keys are camelCase, unlike the snake_case keys of the
// request document.
type execCrawlerPost struct {
// Text is the post text; RunExecCrawlerSearch drops posts whose text is blank
// after trimming.
Text string `json:"text"`
// Permalink, ExternalID and AuthorName are whitespace-trimmed before being
// copied into the resulting DiscoverPost.
Permalink string `json:"permalink"`
ExternalID string `json:"externalId"`
AuthorName string `json:"authorName"`
// The remaining fields are copied into the resulting DiscoverPost unchanged.
LikeCount int `json:"likeCount"`
ReplyCount int `json:"replyCount"`
AuthorVerified bool `json:"authorVerified"`
FollowerCount int `json:"followerCount"`
}
// execCrawlerOutput is the top-level JSON document that RunExecCrawlerSearch
// decodes from the CLI's stdout.
type execCrawlerOutput struct {
// Posts holds the posts reported by the CLI, in the order they were emitted.
Posts []execCrawlerPost `json:"posts"`
}
// RunExecCrawlerSearch runs the Node Playwright keyword-search CLI
// ("npx tsx <cli>") and converts the posts it reports into DiscoverPost values.
//
// The request is written to the CLI as JSON on stdin and the result is decoded
// as JSON from stdout. A blank keyword returns (nil, nil) without starting a
// process, a blank storageState is rejected with an error, and a non-positive
// limit falls back to 12. The child process runs under a three-minute timeout
// derived from ctx.
//
// Errors from the child process wrap their cause, so callers can use errors.Is
// and errors.As to tell a timeout or cancellation (context.DeadlineExceeded,
// context.Canceled) apart from a failing CLI (*exec.ExitError) or a missing npx
// binary (exec.ErrNotFound). Anything the CLI wrote to stderr is included in
// the error message.
func RunExecCrawlerSearch(ctx context.Context, storageState, keyword string, limit int) ([]DiscoverPost, error) {
	keyword = strings.TrimSpace(keyword)
	if keyword == "" {
		return nil, nil
	}
	storageState = strings.TrimSpace(storageState)
	if storageState == "" {
		return nil, fmt.Errorf("找不到 Chrome session請先到連線頁同步 Threads 登入態")
	}
	if limit <= 0 {
		limit = 12
	}

	repoRoot, cliPath, err := resolveKeywordSearchCLI()
	if err != nil {
		return nil, err
	}
	payload, err := json.Marshal(execCrawlerInput{
		StorageState: storageState,
		Query:        keyword,
		Limit:        limit,
	})
	if err != nil {
		return nil, err
	}

	runCtx, cancel := context.WithTimeout(ctx, 3*time.Minute)
	defer cancel()

	// NOTE(review): when runCtx expires only the npx process itself is killed.
	// If its descendants keep the output pipes open, Run can keep waiting;
	// consider setting cmd.WaitDelay (Go 1.20+) if that is observed.
	cmd := exec.CommandContext(runCtx, "npx", "tsx", cliPath)
	cmd.Dir = repoRoot
	cmd.Stdin = bytes.NewReader(payload)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// A killed process only reports "signal: killed"; when the context is
		// done, surface that as the cause so timeouts are recognisable.
		cause := err
		if ctxErr := runCtx.Err(); ctxErr != nil {
			cause = ctxErr
		}
		if msg := strings.TrimSpace(stderr.String()); msg != "" {
			return nil, fmt.Errorf("crawler search failed: %s: %w", msg, cause)
		}
		return nil, fmt.Errorf("crawler search failed: %w", cause)
	}

	var out execCrawlerOutput
	if err := json.Unmarshal(stdout.Bytes(), &out); err != nil {
		return nil, fmt.Errorf("crawler search output parse failed: %w", err)
	}

	posts := make([]DiscoverPost, 0, len(out.Posts))
	for _, item := range out.Posts {
		text := strings.TrimSpace(item.Text)
		if text == "" {
			// Entries without any text are not returned to the caller.
			continue
		}
		posts = append(posts, DiscoverPost{
			Text:           text,
			Permalink:      strings.TrimSpace(item.Permalink),
			ExternalID:     strings.TrimSpace(item.ExternalID),
			Author:         strings.TrimSpace(item.AuthorName),
			AuthorVerified: item.AuthorVerified,
			FollowerCount:  item.FollowerCount,
			LikeCount:      item.LikeCount,
			ReplyCount:     item.ReplyCount,
			Source:         DiscoverCrawler,
		})
	}
	return posts, nil
}
// resolveKeywordSearchCLI locates threads-keyword-search-cli.ts and returns the
// directory the match was found relative to, together with the full path of
// the script.
//
// Lookup order:
//  1. $HAIXUN_REPO_ROOT/haixun-backend/worker/, when the variable is set and
//     the script exists there; otherwise the variable is ignored.
//  2. The working directory and then its parents, six directories in total,
//     checking <dir>/haixun-backend/worker/ before <dir>/worker/ at each level.
//
// An error is returned when the working directory cannot be determined or when
// no candidate exists.
func resolveKeywordSearchCLI() (repoRoot, cliPath string, err error) {
	const (
		scriptName = "threads-keyword-search-cli.ts"
		maxLevels  = 6
	)
	nested := filepath.Join("haixun-backend", "worker", scriptName)
	flat := filepath.Join("worker", scriptName)

	if envRoot := strings.TrimSpace(os.Getenv("HAIXUN_REPO_ROOT")); envRoot != "" {
		if candidate := filepath.Join(envRoot, nested); fileExists(candidate) {
			return envRoot, candidate, nil
		}
	}

	start, wdErr := os.Getwd()
	if wdErr != nil {
		return "", "", fmt.Errorf("resolve crawler cli: %w", wdErr)
	}

	current := start
	for level := 0; level < maxLevels; level++ {
		for _, rel := range []string{nested, flat} {
			if candidate := filepath.Join(current, rel); fileExists(candidate) {
				return current, candidate, nil
			}
		}
		up := filepath.Dir(current)
		if up == current {
			// Reached the filesystem root; there is nothing further to climb.
			break
		}
		current = up
	}
	return "", "", fmt.Errorf("找不到 threads-keyword-search-cli.ts請設定 HAIXUN_REPO_ROOT")
}
// fileExists reports whether path can be stat'ed and is not a directory.
// Every os.Stat failure, not only "does not exist", is reported as false.
func fileExists(path string) bool {
	st, statErr := os.Stat(path)
	if statErr != nil {
		return false
	}
	return !st.IsDir()
}
// CrawlerKeywordFromQuery returns the plain keyword to search for.
//
// A non-blank keyword wins and is returned whitespace-trimmed. Otherwise the
// keyword is derived from a Brave-style query: a leading "site:threads.net "
// is removed, then everything from " after:" onwards, then everything from
// " 請問" onwards, with surrounding double quotes stripped before and after
// each step. A marker found at the very start of the string is left in place.
func CrawlerKeywordFromQuery(query, keyword string) string {
	const quote = `"`

	if explicit := strings.TrimSpace(keyword); explicit != "" {
		return explicit
	}

	result := strings.TrimPrefix(strings.TrimSpace(query), "site:threads.net ")
	for _, marker := range []string{" after:", " 請問"} {
		result = strings.Trim(result, quote)
		// Cut only when something precedes the marker.
		if pos := strings.Index(result, marker); pos > 0 {
			result = strings.TrimSpace(result[:pos])
		}
	}
	return strings.Trim(result, quote)
}