Initial commit
This commit is contained in:
152
internal/apify/client.go
Normal file
152
internal/apify/client.go
Normal file
@@ -0,0 +1,152 @@
|
||||
package apify
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Endpoint root and polling cadence shared by every request in this file.
const (
	// apiBase is the prefix of every Apify v2 URL built by the client.
	apiBase = "https://api.apify.com/v2"

	// pollEvery is the pause between two consecutive run-status requests.
	pollEvery = 3 * time.Second

	// pollTimeout bounds the total time Run spends waiting for a run to
	// reach a terminal status.
	pollTimeout = 5 * time.Minute
)
|
||||
|
||||
// Client wraps the Apify "start a run, wait for it, fetch its dataset"
// lifecycle behind a single Run call.
type Client struct {
	// APIKey is the Apify API token. Run returns an error when it is empty.
	APIKey string

	// HTTP is the client used to send every request.
	HTTP *http.Client
}
|
||||
|
||||
func New(apiKey string) *Client {
|
||||
return &Client{
|
||||
APIKey: apiKey,
|
||||
HTTP: &http.Client{Timeout: 30 * time.Second},
|
||||
}
|
||||
}
|
||||
|
||||
// runResponse is the subset of an Apify run object this package reads. The
// fields of interest sit under the top-level "data" key of the response.
type runResponse struct {
	Data struct {
		// ID is the run identifier used to poll the run.
		ID string `json:"id"`
		// Status is the run status string (e.g. "SUCCEEDED").
		Status string `json:"status"`
		// DefaultDatasetID names the dataset whose items Run returns.
		DefaultDatasetID string `json:"defaultDatasetId"`
	} `json:"data"`
}
|
||||
|
||||
// Run starts an actor run, waits for SUCCEEDED, and returns dataset items as raw JSON.
|
||||
func (c *Client) Run(ctx context.Context, actorID string, input any) ([]json.RawMessage, error) {
|
||||
if c.APIKey == "" {
|
||||
return nil, errors.New("apify api_key not configured")
|
||||
}
|
||||
if actorID == "" {
|
||||
return nil, errors.New("apify actor id is empty")
|
||||
}
|
||||
|
||||
body, err := json.Marshal(input)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Apify URLs use "~" to separate username and actor name, never "/".
|
||||
// Accept either form in config and normalize before path-escaping.
|
||||
urlActorID := strings.ReplaceAll(actorID, "/", "~")
|
||||
startURL := fmt.Sprintf("%s/acts/%s/runs?token=%s", apiBase, url.PathEscape(urlActorID), url.QueryEscape(c.APIKey))
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, startURL, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
resp, err := c.HTTP.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start run: %w", err)
|
||||
}
|
||||
var runResp runResponse
|
||||
if err := decodeJSON(resp, &runResp); err != nil {
|
||||
return nil, fmt.Errorf("start run: %w", err)
|
||||
}
|
||||
if runResp.Data.ID == "" {
|
||||
return nil, errors.New("start run: missing run id")
|
||||
}
|
||||
|
||||
deadline := time.Now().Add(pollTimeout)
|
||||
pollCtx, cancel := context.WithDeadline(ctx, deadline)
|
||||
defer cancel()
|
||||
|
||||
status, datasetID, err := c.waitForRun(pollCtx, runResp.Data.ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if status != "SUCCEEDED" {
|
||||
return nil, fmt.Errorf("apify run terminated with status %s", status)
|
||||
}
|
||||
|
||||
return c.fetchDataset(ctx, datasetID)
|
||||
}
|
||||
|
||||
func (c *Client) waitForRun(ctx context.Context, runID string) (string, string, error) {
|
||||
pollURL := fmt.Sprintf("%s/actor-runs/%s?token=%s", apiBase, url.PathEscape(runID), url.QueryEscape(c.APIKey))
|
||||
for {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, pollURL, nil)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
resp, err := c.HTTP.Do(req)
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("poll run: %w", err)
|
||||
}
|
||||
var r runResponse
|
||||
if err := decodeJSON(resp, &r); err != nil {
|
||||
return "", "", fmt.Errorf("poll run: %w", err)
|
||||
}
|
||||
switch r.Data.Status {
|
||||
case "SUCCEEDED", "FAILED", "ABORTED", "TIMED-OUT":
|
||||
return r.Data.Status, r.Data.DefaultDatasetID, nil
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return "", "", ctx.Err()
|
||||
case <-time.After(pollEvery):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) fetchDataset(ctx context.Context, datasetID string) ([]json.RawMessage, error) {
|
||||
if datasetID == "" {
|
||||
return nil, errors.New("missing dataset id")
|
||||
}
|
||||
dsURL := fmt.Sprintf("%s/datasets/%s/items?clean=true&format=json&token=%s", apiBase, url.PathEscape(datasetID), url.QueryEscape(c.APIKey))
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, dsURL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resp, err := c.HTTP.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("fetch dataset: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 300 {
|
||||
b, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
|
||||
return nil, fmt.Errorf("dataset returned %d: %s", resp.StatusCode, string(b))
|
||||
}
|
||||
var items []json.RawMessage
|
||||
if err := json.NewDecoder(resp.Body).Decode(&items); err != nil {
|
||||
return nil, fmt.Errorf("decode dataset: %w", err)
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
// decodeJSON consumes resp and always closes its body. Responses with a
// status below 300 are JSON-decoded into dst; any other status becomes an
// error of the form "http <code>: <first 1 KiB of body>".
func decodeJSON(resp *http.Response, dst any) error {
	body := resp.Body
	defer body.Close()

	if resp.StatusCode < 300 {
		return json.NewDecoder(body).Decode(dst)
	}

	// Best-effort snippet: a read error simply shortens the message.
	snippet, _ := io.ReadAll(io.LimitReader(body, 1024))
	return fmt.Errorf("http %d: %s", resp.StatusCode, string(snippet))
}
|
||||
313
internal/apify/types.go
Normal file
313
internal/apify/types.go
Normal file
@@ -0,0 +1,313 @@
|
||||
package apify
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ActiveListingInput is the input schema for `automation-lab/ebay-scraper`.
|
||||
// The actor accepts keyword searches and standard filters; it targets
|
||||
// ebay.com only (no per-marketplace routing in the actor itself), so
|
||||
// non-US marketplaces won't return useful results with this actor.
|
||||
type ActiveListingInput struct {
|
||||
SearchQueries []string `json:"searchQueries"`
|
||||
MaxProductsPerSearch int `json:"maxProductsPerSearch,omitempty"`
|
||||
MaxSearchPages int `json:"maxSearchPages,omitempty"`
|
||||
Sort string `json:"sort,omitempty"`
|
||||
ListingType string `json:"listingType,omitempty"`
|
||||
Condition []string `json:"condition,omitempty"`
|
||||
MinPrice *int `json:"minPrice,omitempty"`
|
||||
MaxPrice *int `json:"maxPrice,omitempty"`
|
||||
ProxyConfiguration *ProxyConfiguration `json:"proxyConfiguration,omitempty"`
|
||||
}
|
||||
|
||||
// ProxyConfiguration is the standard Apify input block that selects proxy
// routing. eBay (and most retail sites) answer 403 to datacenter IPs; sending
// {"useApifyProxy": true, "apifyProxyGroups": ["RESIDENTIAL"]} works.
type ProxyConfiguration struct {
	// UseApifyProxy is always serialized, even when false.
	UseApifyProxy bool `json:"useApifyProxy"`
	// ApifyProxyGroups is omitted from the payload when empty.
	ApifyProxyGroups []string `json:"apifyProxyGroups,omitempty"`
	// ApifyProxyCountry is omitted from the payload when empty.
	ApifyProxyCountry string `json:"apifyProxyCountry,omitempty"`
}
|
||||
|
||||
// ActiveListingResult is a deliberately lenient decode target covering more
// than one eBay-scraper actor. delicious_zebu/ebay-product-listing-scraper
// returns productUrl / imageUrl / numeric price, while harvestlab/ebay-scraper
// used url / price / currency; Decode coalesces both shapes.
type ActiveListingResult struct {
	Title string `json:"title"`

	// Price fields are typed any because actors emit either a JSON number or
	// a formatted string.
	Price         any    `json:"price"`
	OriginalPrice any    `json:"originalPrice"`
	Currency      string `json:"currency"`

	// Listing link: actors populate one of these two keys.
	URL        string `json:"url"`
	ProductURL string `json:"productUrl"`

	Store string `json:"store"`

	// Image candidates, listed in the order Decode prefers them.
	ImageURL  string   `json:"imageUrl"`
	Image     string   `json:"image"`
	Thumbnail string   `json:"thumbnail"`
	Images    []string `json:"images"`

	Condition   string `json:"condition"`
	ListingType string `json:"listingType"`

	ShippingCost  any  `json:"shippingCost"`
	ShippingPrice any  `json:"shippingPrice"`
	FreeShipping  bool `json:"freeShipping"`

	// Marketplace is Decode's fallback when Store is empty.
	Marketplace     string  `json:"marketplace"`
	MatchConfidence float64 `json:"matchConfidence"`
	// Availability is passed to isOOS to derive the out-of-stock flag.
	Availability  string `json:"availability"`
	WatchersCount int    `json:"watchersCount"`
	QuantitySold  int    `json:"quantitySold"`
}
|
||||
|
||||
type SoldListingInput struct {
|
||||
Query string `json:"query"`
|
||||
Marketplace string `json:"marketplace,omitempty"`
|
||||
MaxResults int `json:"maxResults,omitempty"`
|
||||
DaysBack int `json:"daysBack,omitempty"`
|
||||
ProxyConfiguration *ProxyConfiguration `json:"proxyConfiguration,omitempty"`
|
||||
}
|
||||
|
||||
// SoldListingResult is the decode target for one sold-listing dataset item.
// Note that Currency and SoldAt are read from the "soldCurrency" and
// "endedAt" JSON keys respectively.
type SoldListingResult struct {
	Title string `json:"title"`

	SoldPrice float64 `json:"soldPrice"`
	Currency  string  `json:"soldCurrency"`
	SoldAt    string  `json:"endedAt"`

	Condition     string  `json:"condition"`
	ListingType   string  `json:"listingType"`
	ShippingPrice float64 `json:"shippingPrice"`
	URL           string  `json:"url"`
}
|
||||
|
||||
type PriceComparisonInput struct {
|
||||
Query string `json:"query,omitempty"`
|
||||
URL string `json:"url,omitempty"`
|
||||
MatchStrictness string `json:"matchStrictness,omitempty"`
|
||||
ProxyConfiguration *ProxyConfiguration `json:"proxyConfiguration,omitempty"`
|
||||
}
|
||||
|
||||
// PriceComparisonResult is a strictly-typed decode target for one price
// comparison item (price is a JSON number here, unlike ActiveListingResult).
type PriceComparisonResult struct {
	Title string `json:"title"`

	Price    float64 `json:"price"`
	Currency string  `json:"currency"`

	URL      string `json:"url"`
	Store    string `json:"store"`
	ImageURL string `json:"imageUrl"`

	Availability    string  `json:"availability"`
	MatchConfidence float64 `json:"matchConfidence"`
}
|
||||
|
||||
// YahooAuctionsJPInput is the input for meron1122/zenmarket-scraper.
// ZenMarket is a buyer-proxy for Yahoo Auctions JP; its scraper returns
// ZenMarket-proxied listing URLs and USD-converted prices.
type YahooAuctionsJPInput struct {
	// SearchTerm is always serialized.
	SearchTerm string `json:"searchTerm"`

	// Optional filters, omitted from the payload when zero.
	CategoryID        string `json:"categoryID,omitempty"`
	MaxPages          int    `json:"maxPages,omitempty"`
	MaxRemainingHours int    `json:"maxRemainingHours,omitempty"`
}
|
||||
|
||||
// MercariJPInput is the input for cloud9_ai/mercari-scraper. That actor
// manages its own proxy (Japan datacenter with residential fallback), so no
// proxyConfiguration block is sent. Every field is omitted when zero.
type MercariJPInput struct {
	// What to scrape: keyword searches and/or explicit product URLs.
	SearchKeywords []string `json:"searchKeywords,omitempty"`
	ProductUrls    []string `json:"productUrls,omitempty"`

	Status string `json:"status,omitempty"`
	SortBy string `json:"sortBy,omitempty"`

	// Price bounds are pointers so that an unset bound is omitted from the
	// payload.
	PriceMin *int `json:"priceMin,omitempty"`
	PriceMax *int `json:"priceMax,omitempty"`

	ItemCondition string `json:"itemCondition,omitempty"`
	MaxResults    int    `json:"maxResults,omitempty"`
}
|
||||
|
||||
// YahooAuctionsJPResult mirrors one item emitted by
// meron1122/zenmarket-scraper. Prices are USD-converted at the
// ZenMarket-published rate.
type YahooAuctionsJPResult struct {
	Name string `json:"name"`
	// CurrentPrice is typed any and normalised through coercePrice by Decode.
	CurrentPrice any `json:"current_price"`
	// Photos holds image URLs; Decode uses the first entry, if any.
	Photos     []string `json:"photos"`
	URL        string   `json:"url"`
	EndingDate string   `json:"ending_date"`
}
|
||||
|
||||
// UnifiedResult is the single, source-independent row shape this package
// hands on after decoding, whichever actor produced the data. The scheduler
// consumes this.
type UnifiedResult struct {
	// Listing essentials.
	Title    string
	Price    float64
	Currency string
	URL      string

	// Where the row came from: the store label, and the source label that
	// was passed to Decode.
	Store    string
	ImageURL string
	Source   string

	MatchConfidence float64
	OutOfStock      bool

	// MatchedQuery records which alias from the item's query list produced
	// this row. Empty for URL-only items or rows from non-search sources.
	MatchedQuery string
}
|
||||
|
||||
// Decode unmarshals a list of raw JSON items into UnifiedResult slices using
|
||||
// the shape that matches the given source label.
|
||||
func Decode(items []json.RawMessage, source string) ([]UnifiedResult, error) {
|
||||
out := make([]UnifiedResult, 0, len(items))
|
||||
switch source {
|
||||
case SourceActiveEbay, SourcePriceCompare:
|
||||
for _, raw := range items {
|
||||
var r ActiveListingResult
|
||||
if err := json.Unmarshal(raw, &r); err != nil {
|
||||
continue
|
||||
}
|
||||
url := r.URL
|
||||
if url == "" {
|
||||
url = r.ProductURL
|
||||
}
|
||||
img := r.ImageURL
|
||||
if img == "" {
|
||||
img = r.Image
|
||||
}
|
||||
if img == "" {
|
||||
img = r.Thumbnail
|
||||
}
|
||||
if img == "" && len(r.Images) > 0 {
|
||||
img = r.Images[0]
|
||||
}
|
||||
store := r.Store
|
||||
if store == "" {
|
||||
store = r.Marketplace
|
||||
}
|
||||
if store == "" && source == SourceActiveEbay {
|
||||
store = "ebay"
|
||||
}
|
||||
cur := r.Currency
|
||||
if cur == "" {
|
||||
cur = "USD"
|
||||
}
|
||||
out = append(out, UnifiedResult{
|
||||
Title: r.Title,
|
||||
Price: coercePrice(r.Price),
|
||||
Currency: cur,
|
||||
URL: url,
|
||||
Store: store,
|
||||
ImageURL: img,
|
||||
Source: source,
|
||||
MatchConfidence: r.MatchConfidence,
|
||||
OutOfStock: isOOS(r.Availability),
|
||||
})
|
||||
}
|
||||
case SourceYahooJP:
|
||||
for _, raw := range items {
|
||||
var r YahooAuctionsJPResult
|
||||
if err := json.Unmarshal(raw, &r); err != nil {
|
||||
continue
|
||||
}
|
||||
img := ""
|
||||
if len(r.Photos) > 0 {
|
||||
img = r.Photos[0]
|
||||
}
|
||||
out = append(out, UnifiedResult{
|
||||
Title: r.Name,
|
||||
Price: coercePrice(r.CurrentPrice),
|
||||
Currency: "USD",
|
||||
URL: r.URL,
|
||||
Store: "yahoo-auctions-jp (via zenmarket)",
|
||||
ImageURL: img,
|
||||
Source: source,
|
||||
})
|
||||
}
|
||||
case SourceMercariJP:
|
||||
// Mercari actors vary in shape; accept either price/currentPrice and title/name.
|
||||
for _, raw := range items {
|
||||
var generic struct {
|
||||
Title string `json:"title"`
|
||||
Name string `json:"name"`
|
||||
Price float64 `json:"price"`
|
||||
CurrentPrice float64 `json:"currentPrice"`
|
||||
Currency string `json:"currency"`
|
||||
URL string `json:"url"`
|
||||
ImageURL string `json:"imageUrl"`
|
||||
Image string `json:"image"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
if err := json.Unmarshal(raw, &generic); err != nil {
|
||||
continue
|
||||
}
|
||||
title := generic.Title
|
||||
if title == "" {
|
||||
title = generic.Name
|
||||
}
|
||||
price := generic.Price
|
||||
if price == 0 {
|
||||
price = generic.CurrentPrice
|
||||
}
|
||||
img := generic.ImageURL
|
||||
if img == "" {
|
||||
img = generic.Image
|
||||
}
|
||||
cur := generic.Currency
|
||||
if cur == "" {
|
||||
cur = "JPY"
|
||||
}
|
||||
out = append(out, UnifiedResult{
|
||||
Title: title,
|
||||
Price: price,
|
||||
Currency: cur,
|
||||
URL: generic.URL,
|
||||
Store: "mercari-jp",
|
||||
ImageURL: img,
|
||||
Source: source,
|
||||
OutOfStock: isOOS(generic.Status),
|
||||
})
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// Source labels. Decode selects an item shape by label and copies the label
// into UnifiedResult.Source.
const (
	// Labels Decode has a decoder for.
	SourceActiveEbay   = "ebay"
	SourcePriceCompare = "price-comparison"
	SourceYahooJP      = "yahoo-auctions-jp"
	SourceMercariJP    = "mercari-jp"

	// Sold-listing labels; Decode has no branch for these.
	SourceSoldEbay    = "ebay-sold"
	SourceSoldYahooJP = "yahoo-auctions-jp-sold"
)
|
||||
|
||||
// coercePrice accepts a price field that might be a number or a string with
// currency symbols / commas (e.g. "$24.99", "1,299.00") and returns it as a
// float64. It returns 0 for nil, unsupported types and unparsable strings, so
// that callers can treat a zero price as "no usable price".
//
// Strings holding more than one number (e.g. a range "$10.00 to $20.00" or
// "$1,299.00 (was $1,500.00)") resolve to the first number rather than 0.
func coercePrice(v any) float64 {
	switch x := v.(type) {
	case float64:
		return x
	case float32:
		return float64(x)
	case int:
		return float64(x)
	case int64:
		return float64(x)
	case string:
		return parsePriceString(x)
	}
	return 0
}

// parsePriceString extracts a price from free-form text, returning 0 when no
// number can be parsed.
func parsePriceString(s string) float64 {
	// Primary path: drop everything except digits, '.' and '-' and parse the
	// remainder as one number.
	cleaned := strings.Map(func(r rune) rune {
		if (r >= '0' && r <= '9') || r == '.' || r == '-' {
			return r
		}
		return -1
	}, s)
	if f, err := strconv.ParseFloat(cleaned, 64); err == nil {
		return f
	}

	// Fallback: the text held several numbers, which the filter above glued
	// into an invalid token. Use the first one.
	if f, err := strconv.ParseFloat(firstNumericToken(s), 64); err == nil {
		return f
	}
	return 0
}

// firstNumericToken returns the first number found in s in a form
// strconv.ParseFloat accepts: optional leading '-', digits with ',' group
// separators removed, and at most one '.'. It returns "" when s has no digit.
func firstNumericToken(s string) string {
	digitAt := func(i int) bool { return i < len(s) && s[i] >= '0' && s[i] <= '9' }

	var b strings.Builder
	seenDigit, seenDot := false, false
scan:
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c >= '0' && c <= '9':
			b.WriteByte(c)
			seenDigit = true
		case c == '.' && !seenDot && digitAt(i+1):
			b.WriteByte(c)
			seenDot = true
		case c == ',' && seenDigit && !seenDot && digitAt(i+1):
			// Thousands separator inside the integer part: skip it.
		case c == '-' && !seenDigit && (digitAt(i+1) || (i+1 < len(s) && s[i+1] == '.')):
			// A sign only counts when it directly precedes the number.
			b.Reset()
			seenDot = false
			b.WriteByte(c)
		case seenDigit:
			// Any other byte after the number has started ends the token.
			break scan
		default:
			// Stray sign or dot before any digit: discard and keep looking.
			b.Reset()
			seenDot = false
		}
	}
	return b.String()
}
|
||||
|
||||
// isOOS reports whether an availability / status string means the listing
// can no longer be bought: out of stock, sold, sold out, or ended.
//
// Matching ignores letter case, surrounding whitespace and the separators
// '_', '-' and ' ', so "OUT_OF_STOCK", "Out of stock", "Sold" and "ENDED" are
// all recognised. Any other value, including "", reports false.
func isOOS(s string) bool {
	key := strings.ToLower(strings.Map(func(r rune) rune {
		switch r {
		case '_', '-', ' ':
			return -1
		}
		return r
	}, strings.TrimSpace(s)))

	switch key {
	case "outofstock", "soldout", "sold", "ended":
		return true
	}
	return false
}
|
||||
Reference in New Issue
Block a user