haixunMaster/haixun-backend/internal/library/placement/crawler_exec.go

166 lines
4.4 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package placement
import (
"bytes"
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// CrawlerSearchFn is the signature of a function that runs a Playwright
// keyword search with a logged-in browser session.
//
// It receives the caller's context, the member the search is run for, the
// keyword to look up and a limit (presumably the maximum number of posts
// wanted; confirm against implementations), and returns the discovered posts
// or an error.
type CrawlerSearchFn func(ctx context.Context, member MemberContext, keyword string, limit int) ([]DiscoverPost, error)
// execCrawlerInput is the JSON request document handed to the crawler CLI.
type execCrawlerInput struct {
	// StorageState carries the browser session state, encoded as "storage_state".
	StorageState string `json:"storage_state"`
	// Query is the search keyword, encoded as "query".
	Query string `json:"query"`
	// Limit is the requested post count, encoded as "limit".
	Limit int `json:"limit"`
}
// execCrawlerPost is one post entry as it appears in the crawler CLI's JSON
// output. The JSON keys are camelCase.
type execCrawlerPost struct {
	// Text is the post body.
	Text string `json:"text"`
	// Permalink is the link to the post.
	Permalink string `json:"permalink"`
	// ExternalID is the post identifier reported by the crawler.
	ExternalID string `json:"externalId"`
	// AuthorName is the name of the post's author.
	AuthorName string `json:"authorName"`
	// LikeCount is the reported number of likes.
	LikeCount int `json:"likeCount"`
	// ReplyCount is the reported number of replies.
	ReplyCount int `json:"replyCount"`
	// AuthorVerified reports whether the author is marked as verified.
	AuthorVerified bool `json:"authorVerified"`
	// FollowerCount is the reported follower count.
	FollowerCount int `json:"followerCount"`
}
// execCrawlerOutput is the top-level JSON document decoded from the crawler
// CLI's output.
type execCrawlerOutput struct {
	// Posts holds the entries found under the "posts" key.
	Posts []execCrawlerPost `json:"posts"`
}
// RunExecCrawlerSearch invokes the Node Playwright CLI (via `npx tsx`) for a
// keyword search and converts its JSON output into DiscoverPost values.
//
// storageState and keyword are whitespace-trimmed first. An empty keyword
// returns (nil, nil) without launching anything; an empty storageState is an
// error. A non-positive limit falls back to 12. The request is written to the
// CLI's stdin as JSON, and the CLI is given three minutes (or until ctx is
// done, whichever comes first) to answer with a JSON document on stdout.
//
// Posts whose text is blank after trimming are dropped. Errors from running
// the CLI wrap their cause, so callers can inspect them with errors.Is /
// errors.As (for example against context.DeadlineExceeded).
func RunExecCrawlerSearch(ctx context.Context, storageState, keyword string, limit int) ([]DiscoverPost, error) {
	keyword = strings.TrimSpace(keyword)
	if keyword == "" {
		return nil, nil
	}
	storageState = strings.TrimSpace(storageState)
	if storageState == "" {
		return nil, fmt.Errorf("找不到 Chrome session請先到連線頁同步 Threads 登入態")
	}
	if limit <= 0 {
		limit = 12
	}

	repoRoot, cliPath, err := resolveKeywordSearchCLI()
	if err != nil {
		return nil, err
	}

	payload, err := json.Marshal(execCrawlerInput{
		StorageState: storageState,
		Query:        keyword,
		Limit:        limit,
	})
	if err != nil {
		return nil, err
	}

	runCtx, cancel := context.WithTimeout(ctx, 3*time.Minute)
	defer cancel()

	// NOTE(review): npx starts further child processes. Killing only npx on
	// timeout may leave them running and still holding the output pipes, in
	// which case Run keeps waiting. Consider cmd.WaitDelay if the module's Go
	// version (1.20+) allows it.
	cmd := exec.CommandContext(runCtx, "npx", "tsx", cliPath)
	cmd.Dir = repoRoot
	cmd.Stdin = bytes.NewReader(payload)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if runErr := cmd.Run(); runErr != nil {
		cause := runErr
		// When the deadline or the caller's cancellation stopped the child,
		// report that instead of the child's bare exit error, which does not
		// say why the process ended.
		if ctxErr := runCtx.Err(); ctxErr != nil {
			cause = ctxErr
		}
		// Prefer the CLI's own stderr text for the message, but always keep
		// the cause in the error chain.
		msg := strings.TrimSpace(stderr.String())
		if msg == "" {
			return nil, fmt.Errorf("crawler search failed: %w", cause)
		}
		return nil, fmt.Errorf("crawler search failed: %s: %w", msg, cause)
	}

	var out execCrawlerOutput
	if err := json.Unmarshal(stdout.Bytes(), &out); err != nil {
		return nil, fmt.Errorf("crawler search output parse failed: %w", err)
	}

	posts := make([]DiscoverPost, 0, len(out.Posts))
	for _, item := range out.Posts {
		text := strings.TrimSpace(item.Text)
		if text == "" {
			// A post without text is of no use downstream; skip it.
			continue
		}
		posts = append(posts, DiscoverPost{
			Text:           text,
			Permalink:      strings.TrimSpace(item.Permalink),
			ExternalID:     strings.TrimSpace(item.ExternalID),
			Author:         strings.TrimSpace(item.AuthorName),
			AuthorVerified: item.AuthorVerified,
			FollowerCount:  item.FollowerCount,
			LikeCount:      item.LikeCount,
			ReplyCount:     item.ReplyCount,
			Source:         DiscoverCrawler,
		})
	}
	return posts, nil
}
// resolveKeywordSearchCLI locates threads-keyword-search-cli.ts and returns
// the directory it was resolved against together with the script's path.
//
// Lookup order:
//  1. $HAIXUN_REPO_ROOT/haixun-backend/worker/, when the variable is set and
//     the script exists there.
//  2. The working directory and up to five of its ancestors, checking
//     <dir>/haixun-backend/worker/ first and <dir>/worker/ second.
//
// An error is returned when the working directory cannot be determined or
// when no candidate exists.
func resolveKeywordSearchCLI() (repoRoot, cliPath string, err error) {
	const cliName = "threads-keyword-search-cli.ts"

	// An explicit root wins, but only if the script is really there;
	// otherwise fall through to the directory walk.
	if envRoot := strings.TrimSpace(os.Getenv("HAIXUN_REPO_ROOT")); envRoot != "" {
		candidate := filepath.Join(envRoot, "haixun-backend", "worker", cliName)
		if fileExists(candidate) {
			return envRoot, candidate, nil
		}
	}

	start, wdErr := os.Getwd()
	if wdErr != nil {
		return "", "", fmt.Errorf("resolve crawler cli: %w", wdErr)
	}

	current := start
	for depth := 0; depth < 6; depth++ {
		nested := filepath.Join(current, "haixun-backend", "worker", cliName)
		if fileExists(nested) {
			return current, nested, nil
		}
		flat := filepath.Join(current, "worker", cliName)
		if fileExists(flat) {
			return current, flat, nil
		}

		// Stop once Dir no longer moves upwards (filesystem root reached).
		up := filepath.Dir(current)
		if up == current {
			break
		}
		current = up
	}

	return "", "", fmt.Errorf("找不到 threads-keyword-search-cli.ts請設定 HAIXUN_REPO_ROOT")
}
// fileExists reports whether path can be stat'ed and is not a directory.
// Any Stat error (not just "does not exist") counts as absent.
func fileExists(path string) bool {
	stat, statErr := os.Stat(path)
	if statErr != nil {
		return false
	}
	return !stat.IsDir()
}
// CrawlerKeywordFromQuery extracts a plain keyword from a Brave-style query
// string.
//
// A non-blank keyword is returned as-is (whitespace-trimmed). Otherwise the
// keyword is derived from query: surrounding whitespace and a leading
// "site:threads.net " are removed, then everything from " after:" onwards and
// everything from " 請問" onwards is cut, with double quotes stripped from both
// ends before and after each cut.
func CrawlerKeywordFromQuery(query, keyword string) string {
	if explicit := strings.TrimSpace(keyword); explicit != "" {
		return explicit
	}

	unquote := func(s string) string {
		return strings.Trim(s, `"`)
	}
	// truncateAt drops marker and everything after it. The input is left
	// untouched when marker is absent or sits at the very start.
	truncateAt := func(s, marker string) string {
		pos := strings.Index(s, marker)
		if pos <= 0 {
			return s
		}
		return strings.TrimSpace(s[:pos])
	}

	term := strings.TrimPrefix(strings.TrimSpace(query), "site:threads.net ")
	term = truncateAt(unquote(term), " after:")
	term = truncateAt(unquote(term), " 請問")
	return unquote(term)
}