From 5614da46482aeea9891e22f0120c837933ca7e61 Mon Sep 17 00:00:00 2001 From: partisan Date: Sun, 9 Jun 2024 12:43:46 +0200 Subject: [PATCH] wip, revert changes --- get-searchxng.go | 14 +++--- go.mod | 12 +++++- go.sum | 22 ++++++++++ text-duckduckgo.go | 77 ++------------------------------- text-google.go | 105 +++++++++++++++++++++++---------------------- text-quant.go | 15 ++----- text.go | 2 +- 7 files changed, 103 insertions(+), 144 deletions(-) diff --git a/get-searchxng.go b/get-searchxng.go index cb88261..b6da71a 100644 --- a/get-searchxng.go +++ b/get-searchxng.go @@ -128,10 +128,10 @@ func isInstanceValid(instance SearXInstance) bool { } } -func main() { - instance, err := getRandomSearXInstance() - if err != nil { - log.Fatalf("Failed to get a SearX instance: %v", err) - } - fmt.Printf("Selected SearX instance: %s\n", instance.URL) -} +// func main() { +// instance, err := getRandomSearXInstance() +// if err != nil { +// log.Fatalf("Failed to get a SearX instance: %v", err) +// } +// fmt.Printf("Selected SearX instance: %s\n", instance.URL) +// } diff --git a/go.mod b/go.mod index 4f6d6fa..0cca960 100644 --- a/go.mod +++ b/go.mod @@ -2,8 +2,18 @@ module searchengine go 1.18 +require github.com/PuerkitoBio/goquery v1.9.1 // direct + require ( - github.com/PuerkitoBio/goquery v1.9.1 // direct github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732 // indirect + github.com/chromedp/chromedp v0.9.5 // indirect + github.com/chromedp/sysutil v1.0.0 // indirect + github.com/gobwas/httphead v0.1.0 // indirect + github.com/gobwas/pool v0.2.1 // indirect + github.com/gobwas/ws v1.3.2 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/mailru/easyjson v0.7.7 // indirect golang.org/x/net v0.21.0 // indirect + golang.org/x/sys v0.17.0 // indirect ) diff --git a/go.sum b/go.sum index f988942..f919d3c 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,24 @@ github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VP github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732 h1:XYUCaZrW8ckGWlCRJKCSoh/iFwlpX316a8yY9IFEzv8= +github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/chromedp v0.9.5 h1:viASzruPJOiThk7c5bueOUY91jGLJVximoEMGoH93rg= +github.com/chromedp/chromedp v0.9.5/go.mod h1:D4I2qONslauw/C7INoCir1BJkSwBYMyZgx8X276z3+Y= +github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= +github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= +github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= +github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= +github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= +github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= +github.com/gobwas/ws v1.3.2 h1:zlnbNHxumkRvfPWgfXu8RBwyNR1x8wh9cf5PTOCqs9Q= +github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= @@ -23,7 +41,11 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= diff --git a/text-duckduckgo.go b/text-duckduckgo.go index 881256b..d003895 100644 --- a/text-duckduckgo.go +++ b/text-duckduckgo.go @@ -1,3 +1,4 @@ +// text-duckduckgo.go package main import ( @@ -6,66 +7,15 @@ import ( "net/http" "net/url" "strings" - "time" "github.com/PuerkitoBio/goquery" ) -const ( - resultsPerPage = 10 -) - -func getVQD(query string) (string, error) { - queryURL := fmt.Sprintf("https://duckduckgo.com/?q=%s", url.QueryEscape(query)) - resp, err := http.Get(queryURL) - if err != nil { - return "", fmt.Errorf("failed to fetch vqd: %v", err) - } - defer resp.Body.Close() - - doc, err := goquery.NewDocumentFromReader(resp.Body) - if err != nil { - return "", fmt.Errorf("loading HTML document: %v", err) - } - - var vqd string - doc.Find("script").Each(func(i int, s *goquery.Selection) { - text := s.Text() - if strings.Contains(text, "vqd=\"") { - start := strings.Index(text, "vqd=\"") + 5 - end := strings.Index(text[start:], "\"") - vqd = text[start : start+end] - } - }) - - if vqd == "" { - return "", fmt.Errorf("vqd not found") - } - - return vqd, nil -} - -func PerformDuckDuckGoTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) { +func PerformDuckDuckGoTextSearch(query, safe, lang string) ([]TextSearchResult, error) { var results []TextSearchResult + searchURL := fmt.Sprintf("https://duckduckgo.com/html/?q=%s", url.QueryEscape(query)) - client := &http.Client{Timeout: 10 * time.Second} - - vqd, err := getVQD(query) - if err != nil { - return nil, fmt.Errorf("failed to get vqd: %v", err) - } - - searchURL := fmt.Sprintf("https://duckduckgo.com/html/?q=%s&kl=%s&safe=%s&s=%d&vqd=%s", - url.QueryEscape(query), lang, safe, (page-1)*resultsPerPage, vqd) - - req, err := http.NewRequest("GET", searchURL, nil) - if err != nil { - return nil, fmt.Errorf("failed to create request: %v", err) - } - - req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36") - - resp, err := client.Do(req) + resp, err := http.Get(searchURL) if err != nil { return nil, fmt.Errorf("making request: %v", err) } @@ -94,34 +44,15 @@ func PerformDuckDuckGoTextSearch(query, safe, lang string, page int) ([]TextSear URL: uddg, Header: strings.TrimSpace(header), Description: strings.TrimSpace(description), - Source: "DuckDuckGo", } results = append(results, result) if debugMode { log.Printf("Processed DuckDuckGo result: %+v\n", result) } - } else { - if debugMode { - log.Printf("Missing 'uddg' parameter in URL: %s\n", rawURL) - } } - } else { - if debugMode { - log.Printf("Error parsing URL: %s, error: %v\n", rawURL, err) - } - } - } else { - if debugMode { - log.Printf("Missing 'href' attribute in result anchor tag\n") } } }) - if len(results) == 0 { - if debugMode { - log.Println("No results found from DuckDuckGo") - } - } - return results, nil } diff --git a/text-google.go b/text-google.go index 9c338cc..c69c5ba 100644 --- a/text-google.go +++ b/text-google.go @@ -1,61 +1,66 @@ package main import ( + "context" "fmt" - "log" - "net/http" "net/url" "strings" + "time" "github.com/PuerkitoBio/goquery" + "github.com/chromedp/chromedp" ) -func PerformGoogleTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) { - const resultsPerPage = 10 +// type TextSearchResult struct { +// URL string +// Header string +// Description string +// } + +// func main() { +// // Example usage +// results, err := PerformGoogleTextSearch("golang", "off", "lang_en", 2) +// if err != nil { +// log.Fatalf("Error performing search: %v", err) +// } + +// for _, result := range results { +// fmt.Printf("URL: %s\nHeader: %s\nDescription: %s\n", result.URL, result.Header, result.Description) +// } +// } + +func PerformGoogleTextSearch(query, safe, lang string, numPages int) ([]TextSearchResult, error) { + ctx, cancel := chromedp.NewContext(context.Background()) + defer cancel() + var results []TextSearchResult - client := &http.Client{} - searchURL := buildSearchURL(query, safe, lang, page, resultsPerPage) + searchURL := buildSearchURL(query, safe, lang, 1, 10) + + err := chromedp.Run(ctx, + chromedp.Navigate(searchURL), + ) - req, err := http.NewRequest("GET", searchURL, nil) if err != nil { - return nil, fmt.Errorf("failed to create request: %v", err) + return nil, fmt.Errorf("failed to navigate to search URL: %v", err) } - // User Agent generation - TextUserAgent, err := GetUserAgent("Text-Search") - if err != nil { - fmt.Println("Error:", err) - return nil, err - } - - if debugMode { - fmt.Println("Generated User Agent (text):", TextUserAgent) - } - - req.Header.Set("User-Agent", TextUserAgent) - - resp, err := client.Do(req) - if err != nil { - return nil, fmt.Errorf("making request: %v", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode) - } - - doc, err := goquery.NewDocumentFromReader(resp.Body) - if err != nil { - return nil, fmt.Errorf("loading HTML document: %v", err) - } - - results = parseResults(doc) - - if len(results) == 0 { - if debugMode { - log.Println("No results found from Google") + for page := 1; page <= numPages; page++ { + var pageSource string + err := chromedp.Run(ctx, + chromedp.Sleep(2*time.Second), + chromedp.OuterHTML("html", &pageSource), + chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight);`, nil), + ) + if err != nil { + return nil, fmt.Errorf("failed to retrieve page source: %v", err) } + + newResults, err := parseResults(pageSource) + if err != nil { + return nil, fmt.Errorf("error parsing results: %v", err) + } + results = append(results, newResults...) } return results, nil @@ -72,20 +77,21 @@ func buildSearchURL(query, safe, lang string, page, resultsPerPage int) string { langParam = "&lr=" + lang } - startIndex := (page - 1) * resultsPerPage - return fmt.Sprintf("https://www.google.com/search?q=%s%s%s&udm=14&start=%d", url.QueryEscape(query), safeParam, langParam, startIndex) + return fmt.Sprintf("https://www.google.com/search?q=%s%s%s", url.QueryEscape(query), safeParam, langParam) } -func parseResults(doc *goquery.Document) []TextSearchResult { +func parseResults(pageSource string) ([]TextSearchResult, error) { var results []TextSearchResult + doc, err := goquery.NewDocumentFromReader(strings.NewReader(pageSource)) + if err != nil { + return nil, fmt.Errorf("loading HTML document: %v", err) + } + doc.Find(".yuRUbf").Each(func(i int, s *goquery.Selection) { link := s.Find("a") href, exists := link.Attr("href") if !exists { - if debugMode { - log.Printf("No href attribute found for result %d\n", i) - } return } @@ -104,10 +110,7 @@ func parseResults(doc *goquery.Document) []TextSearchResult { Description: description, } results = append(results, result) - if debugMode { - log.Printf("Google result: %+v\n", result) - } }) - return results + return results, nil } diff --git a/text-quant.go b/text-quant.go index c090ffe..de8b03a 100644 --- a/text-quant.go +++ b/text-quant.go @@ -3,7 +3,6 @@ package main import ( "encoding/json" "fmt" - "log" "net/http" "net/url" "time" @@ -27,11 +26,9 @@ type QwantTextAPIResponse struct { } // PerformQwantTextSearch contacts the Qwant API and returns a slice of TextSearchResult -func PerformQwantTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) { +func PerformQwantTextSearch(query, safe, lang string) ([]TextSearchResult, error) { const resultsPerPage = 10 - - // Calculate the offset based on the page number - offset := (page - 1) * resultsPerPage + const offset = 0 // Ensure safe search is disabled by default if not specified if safe == "" { @@ -43,12 +40,11 @@ func PerformQwantTextSearch(query, safe, lang string, page int) ([]TextSearchRes lang = "en_CA" } - apiURL := fmt.Sprintf("https://api.qwant.com/v3/search/web?q=%s&count=%d&locale=%s&offset=%d&device=desktop&safesearch=%s", + apiURL := fmt.Sprintf("https://api.qwant.com/v3/search/web?q=%s&count=%d&locale=%s&offset=%d&device=desktop", url.QueryEscape(query), resultsPerPage, lang, - offset, - safe) + offset) client := &http.Client{Timeout: 10 * time.Second} @@ -97,9 +93,6 @@ func PerformQwantTextSearch(query, safe, lang string, page int) ([]TextSearchRes func cleanQwantURL(rawURL string) string { u, err := url.Parse(rawURL) if err != nil { - if debugMode { - log.Printf("Error parsing URL: %v", err) - } return rawURL } return u.Scheme + "://" + u.Host + u.Path diff --git a/text.go b/text.go index 6845fab..6d7c727 100644 --- a/text.go +++ b/text.go @@ -135,7 +135,7 @@ func fetchAndCacheTextResults(query, safe, lang string, page, resultsPerPage int Source string }{ {PerformGoogleTextSearch, "Google"}, - {PerformLibreXTextSearch, "LibreX"}, + // {PerformLibreXTextSearch, "LibreX"}, // {PerformSearXNGTextSearch, "SearXNG"}, }