wip, revert changes

This commit is contained in:
partisan 2024-06-09 12:43:46 +02:00
parent 606cae2dc9
commit 5614da4648
7 changed files with 103 additions and 144 deletions

View file

@ -128,10 +128,10 @@ func isInstanceValid(instance SearXInstance) bool {
} }
} }
func main() { // func main() {
instance, err := getRandomSearXInstance() // instance, err := getRandomSearXInstance()
if err != nil { // if err != nil {
log.Fatalf("Failed to get a SearX instance: %v", err) // log.Fatalf("Failed to get a SearX instance: %v", err)
} // }
fmt.Printf("Selected SearX instance: %s\n", instance.URL) // fmt.Printf("Selected SearX instance: %s\n", instance.URL)
} // }

12
go.mod
View file

@ -2,8 +2,18 @@ module searchengine
go 1.18 go 1.18
require github.com/PuerkitoBio/goquery v1.9.1 // direct
require ( require (
github.com/PuerkitoBio/goquery v1.9.1 // direct
github.com/andybalholm/cascadia v1.3.2 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732 // indirect
github.com/chromedp/chromedp v0.9.5 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.3.2 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
golang.org/x/net v0.21.0 // indirect golang.org/x/net v0.21.0 // indirect
golang.org/x/sys v0.17.0 // indirect
) )

22
go.sum
View file

@ -2,6 +2,24 @@ github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VP
github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732 h1:XYUCaZrW8ckGWlCRJKCSoh/iFwlpX316a8yY9IFEzv8=
github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/chromedp v0.9.5 h1:viASzruPJOiThk7c5bueOUY91jGLJVximoEMGoH93rg=
github.com/chromedp/chromedp v0.9.5/go.mod h1:D4I2qONslauw/C7INoCir1BJkSwBYMyZgx8X276z3+Y=
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.3.2 h1:zlnbNHxumkRvfPWgfXu8RBwyNR1x8wh9cf5PTOCqs9Q=
github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
@ -23,7 +41,11 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=

View file

@ -1,3 +1,4 @@
// text-duckduckgo.go
package main package main
import ( import (
@ -6,66 +7,15 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"strings" "strings"
"time"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
) )
const ( func PerformDuckDuckGoTextSearch(query, safe, lang string) ([]TextSearchResult, error) {
resultsPerPage = 10
)
// getVQD requests the DuckDuckGo landing page for query and returns the value
// of the first vqd="..." assignment found in the text of a <script> element.
// When several scripts carry such an assignment, the last one wins.
//
// An error is returned when the request fails, the server replies with a
// non-200 status, the HTML cannot be parsed, or no non-empty vqd value is
// present in any script.
func getVQD(query string) (string, error) {
	queryURL := fmt.Sprintf("https://duckduckgo.com/?q=%s", url.QueryEscape(query))

	// http.Get goes through http.DefaultClient, which never times out; bound
	// the request so a stalled connection cannot block the caller forever.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Get(queryURL)
	if err != nil {
		return "", fmt.Errorf("failed to fetch vqd: %w", err)
	}
	defer resp.Body.Close()

	// Do not try to scrape an error page as if it were the search page.
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("failed to fetch vqd: unexpected status code: %d", resp.StatusCode)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return "", fmt.Errorf("loading HTML document: %w", err)
	}

	var vqd string
	doc.Find("script").Each(func(_ int, s *goquery.Selection) {
		if token, ok := extractVQD(s.Text()); ok {
			vqd = token
		}
	})

	if vqd == "" {
		return "", fmt.Errorf("vqd not found")
	}
	return vqd, nil
}

// extractVQD returns the text between the first `vqd="` marker in text and the
// next double quote. The boolean is false when the marker is absent or the
// value is not terminated by a closing quote; the previous index arithmetic
// panicked with a slice-bounds error in the unterminated case.
func extractVQD(text string) (string, bool) {
	_, rest, found := strings.Cut(text, "vqd=\"")
	if !found {
		return "", false
	}
	value, _, closed := strings.Cut(rest, "\"")
	if !closed {
		return "", false
	}
	return value, true
}
func PerformDuckDuckGoTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) {
var results []TextSearchResult var results []TextSearchResult
searchURL := fmt.Sprintf("https://duckduckgo.com/html/?q=%s", url.QueryEscape(query))
client := &http.Client{Timeout: 10 * time.Second} resp, err := http.Get(searchURL)
vqd, err := getVQD(query)
if err != nil {
return nil, fmt.Errorf("failed to get vqd: %v", err)
}
searchURL := fmt.Sprintf("https://duckduckgo.com/html/?q=%s&kl=%s&safe=%s&s=%d&vqd=%s",
url.QueryEscape(query), lang, safe, (page-1)*resultsPerPage, vqd)
req, err := http.NewRequest("GET", searchURL, nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %v", err)
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36")
resp, err := client.Do(req)
if err != nil { if err != nil {
return nil, fmt.Errorf("making request: %v", err) return nil, fmt.Errorf("making request: %v", err)
} }
@ -94,34 +44,15 @@ func PerformDuckDuckGoTextSearch(query, safe, lang string, page int) ([]TextSear
URL: uddg, URL: uddg,
Header: strings.TrimSpace(header), Header: strings.TrimSpace(header),
Description: strings.TrimSpace(description), Description: strings.TrimSpace(description),
Source: "DuckDuckGo",
} }
results = append(results, result) results = append(results, result)
if debugMode { if debugMode {
log.Printf("Processed DuckDuckGo result: %+v\n", result) log.Printf("Processed DuckDuckGo result: %+v\n", result)
} }
} else {
if debugMode {
log.Printf("Missing 'uddg' parameter in URL: %s\n", rawURL)
} }
} }
} else {
if debugMode {
log.Printf("Error parsing URL: %s, error: %v\n", rawURL, err)
}
}
} else {
if debugMode {
log.Printf("Missing 'href' attribute in result anchor tag\n")
}
} }
}) })
if len(results) == 0 {
if debugMode {
log.Println("No results found from DuckDuckGo")
}
}
return results, nil return results, nil
} }

View file

@ -1,61 +1,66 @@
package main package main
import ( import (
"context"
"fmt" "fmt"
"log"
"net/http"
"net/url" "net/url"
"strings" "strings"
"time"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
) )
func PerformGoogleTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) { // type TextSearchResult struct {
const resultsPerPage = 10 // URL string
// Header string
// Description string
// }
// func main() {
// // Example usage
// results, err := PerformGoogleTextSearch("golang", "off", "lang_en", 2)
// if err != nil {
// log.Fatalf("Error performing search: %v", err)
// }
// for _, result := range results {
// fmt.Printf("URL: %s\nHeader: %s\nDescription: %s\n", result.URL, result.Header, result.Description)
// }
// }
func PerformGoogleTextSearch(query, safe, lang string, numPages int) ([]TextSearchResult, error) {
ctx, cancel := chromedp.NewContext(context.Background())
defer cancel()
var results []TextSearchResult var results []TextSearchResult
client := &http.Client{} searchURL := buildSearchURL(query, safe, lang, 1, 10)
searchURL := buildSearchURL(query, safe, lang, page, resultsPerPage)
err := chromedp.Run(ctx,
chromedp.Navigate(searchURL),
)
req, err := http.NewRequest("GET", searchURL, nil)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to create request: %v", err) return nil, fmt.Errorf("failed to navigate to search URL: %v", err)
} }
// User Agent generation for page := 1; page <= numPages; page++ {
TextUserAgent, err := GetUserAgent("Text-Search") var pageSource string
err := chromedp.Run(ctx,
chromedp.Sleep(2*time.Second),
chromedp.OuterHTML("html", &pageSource),
chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight);`, nil),
)
if err != nil { if err != nil {
fmt.Println("Error:", err) return nil, fmt.Errorf("failed to retrieve page source: %v", err)
return nil, err
} }
if debugMode { newResults, err := parseResults(pageSource)
fmt.Println("Generated User Agent (text):", TextUserAgent)
}
req.Header.Set("User-Agent", TextUserAgent)
resp, err := client.Do(req)
if err != nil { if err != nil {
return nil, fmt.Errorf("making request: %v", err) return nil, fmt.Errorf("error parsing results: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return nil, fmt.Errorf("loading HTML document: %v", err)
}
results = parseResults(doc)
if len(results) == 0 {
if debugMode {
log.Println("No results found from Google")
} }
results = append(results, newResults...)
} }
return results, nil return results, nil
@ -72,20 +77,21 @@ func buildSearchURL(query, safe, lang string, page, resultsPerPage int) string {
langParam = "&lr=" + lang langParam = "&lr=" + lang
} }
startIndex := (page - 1) * resultsPerPage return fmt.Sprintf("https://www.google.com/search?q=%s%s%s", url.QueryEscape(query), safeParam, langParam)
return fmt.Sprintf("https://www.google.com/search?q=%s%s%s&udm=14&start=%d", url.QueryEscape(query), safeParam, langParam, startIndex)
} }
func parseResults(doc *goquery.Document) []TextSearchResult { func parseResults(pageSource string) ([]TextSearchResult, error) {
var results []TextSearchResult var results []TextSearchResult
doc, err := goquery.NewDocumentFromReader(strings.NewReader(pageSource))
if err != nil {
return nil, fmt.Errorf("loading HTML document: %v", err)
}
doc.Find(".yuRUbf").Each(func(i int, s *goquery.Selection) { doc.Find(".yuRUbf").Each(func(i int, s *goquery.Selection) {
link := s.Find("a") link := s.Find("a")
href, exists := link.Attr("href") href, exists := link.Attr("href")
if !exists { if !exists {
if debugMode {
log.Printf("No href attribute found for result %d\n", i)
}
return return
} }
@ -104,10 +110,7 @@ func parseResults(doc *goquery.Document) []TextSearchResult {
Description: description, Description: description,
} }
results = append(results, result) results = append(results, result)
if debugMode {
log.Printf("Google result: %+v\n", result)
}
}) })
return results return results, nil
} }

View file

@ -3,7 +3,6 @@ package main
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"log"
"net/http" "net/http"
"net/url" "net/url"
"time" "time"
@ -27,11 +26,9 @@ type QwantTextAPIResponse struct {
} }
// PerformQwantTextSearch contacts the Qwant API and returns a slice of TextSearchResult // PerformQwantTextSearch contacts the Qwant API and returns a slice of TextSearchResult
func PerformQwantTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) { func PerformQwantTextSearch(query, safe, lang string) ([]TextSearchResult, error) {
const resultsPerPage = 10 const resultsPerPage = 10
const offset = 0
// Calculate the offset based on the page number
offset := (page - 1) * resultsPerPage
// Ensure safe search is disabled by default if not specified // Ensure safe search is disabled by default if not specified
if safe == "" { if safe == "" {
@ -43,12 +40,11 @@ func PerformQwantTextSearch(query, safe, lang string, page int) ([]TextSearchRes
lang = "en_CA" lang = "en_CA"
} }
apiURL := fmt.Sprintf("https://api.qwant.com/v3/search/web?q=%s&count=%d&locale=%s&offset=%d&device=desktop&safesearch=%s", apiURL := fmt.Sprintf("https://api.qwant.com/v3/search/web?q=%s&count=%d&locale=%s&offset=%d&device=desktop",
url.QueryEscape(query), url.QueryEscape(query),
resultsPerPage, resultsPerPage,
lang, lang,
offset, offset)
safe)
client := &http.Client{Timeout: 10 * time.Second} client := &http.Client{Timeout: 10 * time.Second}
@ -97,9 +93,6 @@ func PerformQwantTextSearch(query, safe, lang string, page int) ([]TextSearchRes
func cleanQwantURL(rawURL string) string { func cleanQwantURL(rawURL string) string {
u, err := url.Parse(rawURL) u, err := url.Parse(rawURL)
if err != nil { if err != nil {
if debugMode {
log.Printf("Error parsing URL: %v", err)
}
return rawURL return rawURL
} }
return u.Scheme + "://" + u.Host + u.Path return u.Scheme + "://" + u.Host + u.Path

View file

@ -135,7 +135,7 @@ func fetchAndCacheTextResults(query, safe, lang string, page, resultsPerPage int
Source string Source string
}{ }{
{PerformGoogleTextSearch, "Google"}, {PerformGoogleTextSearch, "Google"},
{PerformLibreXTextSearch, "LibreX"}, // {PerformLibreXTextSearch, "LibreX"},
// {PerformSearXNGTextSearch, "SearXNG"}, // {PerformSearXNGTextSearch, "SearXNG"},
} }