v0.2.0 #11

Merged
partisan merged 11 commits from work into main 2024-09-09 15:32:06 +00:00
3 changed files with 239 additions and 1 deletions
Showing only changes of commit 56ce016db9 - Show all commits

237
images-deviantart.go Normal file
View file

@ -0,0 +1,237 @@
package main
import (
"fmt"
"net/http"
"net/url"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
)
// NextPageCache is a mutex-guarded, in-memory store of "next page" links
// keyed by search parameters. Each entry remembers when it was stored, and
// entries older than the configured expiration are treated as missing.
type NextPageCache struct {
	mu         sync.Mutex
	links      map[string]nextPageEntry
	expiration time.Duration
}

// nextPageEntry pairs a cached link with the time it was stored so the cache
// can tell whether the entry has outlived its expiration.
type nextPageEntry struct {
	link     string
	storedAt time.Time
}

// NewNextPageCache creates an empty NextPageCache whose entries live for the
// given expiration. A zero or negative expiration disables expiry entirely.
func NewNextPageCache(expiration time.Duration) *NextPageCache {
	return &NextPageCache{
		links:      make(map[string]nextPageEntry),
		expiration: expiration,
	}
}

// expired reports whether entry is older than the cache's expiration at the
// instant now. The caller must hold npc.mu.
func (npc *NextPageCache) expired(entry nextPageEntry, now time.Time) bool {
	return npc.expiration > 0 && now.Sub(entry.storedAt) > npc.expiration
}

// Get retrieves the next page link stored for key. It returns ("", false)
// when no link is stored or when the stored link has expired; an expired
// entry is removed from the cache as a side effect.
func (npc *NextPageCache) Get(key CacheKey) (string, bool) {
	npc.mu.Lock()
	defer npc.mu.Unlock()

	id := npc.keyToString(key)
	entry, exists := npc.links[id]
	if !exists {
		return "", false
	}
	if npc.expired(entry, time.Now()) {
		delete(npc.links, id)
		return "", false
	}
	return entry.link, true
}

// Set stores the next page link for key, replacing any previous value and
// restarting its expiration window.
//
// NOTE: the original author wondered whether this should be folded into
// "cache.go" instead of being a dedicated type.
func (npc *NextPageCache) Set(key CacheKey, link string) {
	npc.mu.Lock()
	defer npc.mu.Unlock()

	now := time.Now()
	// Sweep stale entries on every write so that keys which are never read
	// again cannot accumulate for the lifetime of the process.
	for id, entry := range npc.links {
		if npc.expired(entry, now) {
			delete(npc.links, id)
		}
	}
	npc.links[npc.keyToString(key)] = nextPageEntry{link: link, storedAt: now}
}

// keyToString flattens a CacheKey into the "query|page|safe|lang|type" string
// used as the internal map key.
func (npc *NextPageCache) keyToString(key CacheKey) string {
	return fmt.Sprintf("%s|%d|%t|%s|%s", key.Query, key.Page, key.Safe, key.Lang, key.Type)
}
// nextPageCache is the package-wide store of DeviantArt next-page links,
// constructed with a 6-hour expiration setting.
var nextPageCache = NewNextPageCache(6 * time.Hour)
// Tunables for scraping DeviantArt.
const (
	// deviantArtRequestTimeout caps the search-page request so a stalled
	// connection cannot hang the caller indefinitely.
	deviantArtRequestTimeout = 15 * time.Second
	// deviantArtMaxImageChecks bounds how many image URLs are probed at once.
	deviantArtMaxImageChecks = 8
)

// PerformDeviantArtImageSearch scrapes a DeviantArt search page and returns
// the images whose URLs respond successfully to isValidImageURL.
//
// query is the search text, safe is compared against "active" to build the
// cache key, lang is only used in the cache key, and page is the 1-based page
// number (values below 1 are treated as 1). Page 1 is fetched from a URL built
// from the query; later pages are fetched from the "next page" link that was
// scraped and cached while serving the preceding page, so requesting page N+1
// without a prior, unexpired request for page N returns an error.
//
// It returns the results in page order, the elapsed time, and an error when
// the request fails, the response is not 200 OK, the HTML cannot be parsed,
// or no usable image is found. On request/parse errors the duration is 0.
func PerformDeviantArtImageSearch(query, safe, lang string, page int) ([]ImageSearchResult, time.Duration, error) {
	startTime := time.Now()

	if page < 1 {
		page = 1
	}

	cacheKey := CacheKey{
		Query: query,
		Page:  page,
		Safe:  safe == "active",
		Lang:  lang,
		Type:  "deviantart",
	}

	// Only the first page has a predictable URL; every later page is reached
	// through the link scraped from its predecessor.
	var searchURL string
	if page > 1 {
		nextPageLink, found := nextPageCache.Get(cacheKey)
		if !found {
			return nil, 0, fmt.Errorf("next page link not found in cache")
		}
		searchURL = nextPageLink
	} else {
		searchURL = buildDeviantArtSearchURL(query, page)
	}

	userAgent, err := GetUserAgent("Image-Search-DeviantArt")
	if err != nil {
		return nil, 0, err
	}

	client := &http.Client{Timeout: deviantArtRequestTimeout}
	req, err := http.NewRequest(http.MethodGet, searchURL, nil)
	if err != nil {
		return nil, 0, fmt.Errorf("creating request: %w", err)
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return nil, 0, fmt.Errorf("making request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, 0, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, 0, fmt.Errorf("loading HTML document: %w", err)
	}

	// Collect candidates in document order first so the final result order is
	// deterministic regardless of which probe finishes first.
	var candidates []ImageSearchResult
	doc.Find("div._2pZkk div div a").Each(func(_ int, s *goquery.Selection) {
		if candidate, ok := parseDeviantArtResult(s); ok {
			candidates = append(candidates, candidate)
		}
	})

	// Probe every candidate image concurrently, but with a bounded number of
	// in-flight requests. Each goroutine writes only its own slot.
	accessible := make([]bool, len(candidates))
	sem := make(chan struct{}, deviantArtMaxImageChecks)
	var wg sync.WaitGroup
	for i := range candidates {
		wg.Add(1)
		sem <- struct{}{}
		go func(i int) {
			defer wg.Done()
			defer func() { <-sem }()
			accessible[i] = isValidImageURL(candidates[i].Media, userAgent, candidates[i].Source)
		}(i)
	}
	wg.Wait()

	results := make([]ImageSearchResult, 0, len(candidates))
	for i, candidate := range candidates {
		if accessible[i] {
			results = append(results, candidate)
		}
	}

	// Cache the link to the following page under the key that a request for
	// that page will look up (page+1), resolved to an absolute URL in case
	// the href is relative.
	if href := doc.Find("a._1OGeq").Last().AttrOr("href", ""); href != "" {
		base := req.URL
		if resp.Request != nil && resp.Request.URL != nil {
			base = resp.Request.URL // final URL after any redirects
		}
		if next, err := base.Parse(href); err == nil {
			nextKey := cacheKey
			nextKey.Page = page + 1
			nextPageCache.Set(nextKey, next.String())
		}
	}

	duration := time.Since(startTime)

	if len(results) == 0 {
		return nil, duration, fmt.Errorf("no images found")
	}
	return results, duration, nil
}

// parseDeviantArtResult turns one result anchor into an ImageSearchResult.
// The boolean is false when the anchor should be skipped: it is marked as
// watcher-only content, or it lacks an image source, link, or title.
func parseDeviantArtResult(s *goquery.Selection) (ImageSearchResult, bool) {
	// Skip blurred (premium) images. The notice is looked up in the
	// parent's div > div > div descendants, which is what the previous
	// XPath-style "../div/div/div" expression described; that expression is
	// not valid CSS, so it never matched anything.
	premiumText := s.Parent().
		ChildrenFiltered("div").
		ChildrenFiltered("div").
		ChildrenFiltered("div").
		Text()
	if strings.Contains(premiumText, "Watch the artist to view this deviation") {
		return ImageSearchResult{}, false
	}

	// Prefer srcset, falling back to data-src when it is absent or empty.
	img := s.Find("div img")
	rawSrc, _ := img.Attr("srcset")
	if rawSrc == "" {
		rawSrc, _ = img.Attr("data-src")
	}
	// srcset may list several candidates with descriptors; keep the first URL.
	fields := strings.Fields(rawSrc)
	if len(fields) == 0 {
		return ImageSearchResult{}, false
	}
	imgSrc := fields[0]

	// Drop everything from "/v1" onward in the path; query parameters are kept.
	if parsedURL, err := url.Parse(imgSrc); err == nil {
		if i := strings.Index(parsedURL.Path, "/v1"); i >= 0 {
			parsedURL.Path = parsedURL.Path[:i]
		}
		imgSrc = parsedURL.String()
	}

	resultURL := s.AttrOr("href", "")
	title := strings.TrimSpace(s.AttrOr("aria-label", ""))
	if title == "" || resultURL == "" || imgSrc == "" {
		return ImageSearchResult{}, false
	}

	return ImageSearchResult{
		Title:      title,
		Media:      imgSrc,
		Width:      0,
		Height:     0,
		Source:     resultURL,
		ThumbProxy: imgSrc,
	}, true
}
// buildDeviantArtSearchURL returns the DeviantArt search URL for query, with
// the query URL-encoded as the "q" parameter. The page argument is accepted
// but does not influence the URL.
func buildDeviantArtSearchURL(query string, page int) string {
	searchURL := url.URL{
		Scheme:   "https",
		Host:     "www.deviantart.com",
		Path:     "/search",
		RawQuery: url.Values{"q": {query}}.Encode(),
	}
	return searchURL.String()
}
// isValidImageURL reports whether a HEAD request for imgSrc, sent with the
// given User-Agent and Referer headers, answers 200 OK. Any failure to build
// or perform the request (including a timeout) yields false.
func isValidImageURL(imgSrc, userAgent, referer string) bool {
	if imgSrc == "" {
		return false
	}

	// A bounded timeout keeps one unresponsive image host from stalling the
	// caller, which waits for every probe to finish.
	client := &http.Client{Timeout: 10 * time.Second}

	req, err := http.NewRequest(http.MethodHead, imgSrc, nil)
	if err != nil {
		return false
	}

	// Set headers to mimic a regular browser request.
	req.Header.Set("User-Agent", userAgent)
	req.Header.Set("Referer", referer)

	resp, err := client.Do(req)
	if err != nil {
		return false
	}
	defer resp.Body.Close()

	return resp.StatusCode == http.StatusOK
}
// // Example usage:
// func main() {
// results, duration, err := PerformDeviantArtImageSearch("kittens", "false", "en", 1)
// if err != nil {
// fmt.Println("Error:", err)
// return
// }
// fmt.Printf("Search took: %v\n", duration)
// fmt.Printf("Total results: %d\n", len(results))
// for _, result := range results {
// fmt.Printf("Title: %s\nThumbnail: %s\nMedia: %s\nSource (Original Image URL): %s\n\n",
// result.Title, result.Thumbnail, result.Media, result.Source)
// }
// }

View file

@ -58,7 +58,7 @@ func PerformQwantImageSearch(query, safe, lang string, page int) ([]ImageSearchR
return nil, 0, fmt.Errorf("creating request: %v", err)
}
ImageUserAgent, err := GetUserAgent("Image-Search")
ImageUserAgent, err := GetUserAgent("Image-Search-Quant")
if err != nil {
return nil, 0, err
}

View file

@ -13,6 +13,7 @@ var imageSearchEngines []SearchEngine
func init() {
imageSearchEngines = []SearchEngine{
{Name: "Qwant", Func: wrapImageSearchFunc(PerformQwantImageSearch), Weight: 1},
{Name: "DeviantArt", Func: wrapImageSearchFunc(PerformDeviantArtImageSearch), Weight: 2},
{Name: "Bing", Func: wrapImageSearchFunc(PerformBingImageSearch), Weight: 2}, // Bing sometimes returns with low amount of images, this leads to danamica page loading not working
{Name: "Imgur", Func: wrapImageSearchFunc(PerformImgurImageSearch), Weight: 3},
}