Initial commit

This commit is contained in:
2026-05-13 19:42:49 -07:00
commit cfa01bd4ef
54 changed files with 11718 additions and 0 deletions

152
internal/apify/client.go Normal file
View File

@@ -0,0 +1,152 @@
package apify
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
)
// Endpoint and polling settings shared by the Client methods in this file.
const (
// apiBase is the URL prefix every request in this file is built from.
apiBase = "https://api.apify.com/v2"
// pollEvery is the pause between two consecutive status polls in waitForRun.
pollEvery = 3 * time.Second
// pollTimeout bounds the total time Run spends waiting for a terminal status.
pollTimeout = 5 * time.Minute
)
// Client drives the Apify run-and-fetch lifecycle: start an actor run, poll it
// until it finishes, then download the dataset it produced.
type Client struct {
	// APIKey is the Apify API token sent with every request.
	APIKey string
	// HTTP is the client used to perform all requests.
	HTTP *http.Client
}

// New returns a Client for apiKey whose HTTP client enforces a 30-second
// timeout on each individual request.
func New(apiKey string) *Client {
	httpClient := &http.Client{Timeout: 30 * time.Second}
	return &Client{APIKey: apiKey, HTTP: httpClient}
}
// runResponse is the part of the Apify run object that this client reads. It
// is decoded both from the start-run response (Run) and from each status poll
// (waitForRun); every field lives under the top-level "data" key.
type runResponse struct {
Data struct {
// ID identifies the run; Run treats an empty value as an error.
ID string `json:"id"`
// Status is compared against the terminal states in waitForRun.
Status string `json:"status"`
// DefaultDatasetID is the dataset later downloaded by fetchDataset.
DefaultDatasetID string `json:"defaultDatasetId"`
} `json:"data"`
}
// Run starts an actor run, waits for it to reach a terminal state, and returns
// the items of the run's default dataset as raw JSON.
//
// actorID may be written as "user/actor" or "user~actor". input is marshalled
// to JSON and sent as the request body of the start call.
//
// An error is returned when the API key, the actor ID or the HTTP client is
// missing, when input cannot be marshalled, when a request fails, when ctx is
// cancelled or polling exceeds pollTimeout, or when the run finishes with any
// status other than SUCCEEDED.
func (c *Client) Run(ctx context.Context, actorID string, input any) ([]json.RawMessage, error) {
	if c.APIKey == "" {
		return nil, errors.New("apify api_key not configured")
	}
	if actorID == "" {
		return nil, errors.New("apify actor id is empty")
	}
	// Client has exported fields, so it can be built without New; fail with an
	// error instead of a nil-pointer panic in that case.
	if c.HTTP == nil {
		return nil, errors.New("apify http client not configured")
	}

	body, err := json.Marshal(input)
	if err != nil {
		return nil, fmt.Errorf("marshal input: %w", err)
	}

	// Apify URLs use "~" to separate username and actor name, never "/".
	// Accept either form in config and normalize before path-escaping.
	urlActorID := strings.ReplaceAll(actorID, "/", "~")
	startURL := fmt.Sprintf("%s/acts/%s/runs", apiBase, url.PathEscape(urlActorID))
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, startURL, bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")
	// The token goes in the Authorization header, not the query string:
	// errors returned by http.Client.Do embed the request URL, so a token in
	// the URL would leak into every wrapped error and log line.
	req.Header.Set("Authorization", "Bearer "+c.APIKey)

	resp, err := c.HTTP.Do(req)
	if err != nil {
		return nil, fmt.Errorf("start run: %w", err)
	}
	var runResp runResponse
	if err := decodeJSON(resp, &runResp); err != nil {
		return nil, fmt.Errorf("start run: %w", err)
	}
	if runResp.Data.ID == "" {
		return nil, errors.New("start run: missing run id")
	}

	// Only the polling phase is bounded by pollTimeout; the dataset download
	// below runs under the caller's ctx.
	pollCtx, cancel := context.WithTimeout(ctx, pollTimeout)
	defer cancel()
	status, datasetID, err := c.waitForRun(pollCtx, runResp.Data.ID)
	if err != nil {
		return nil, err
	}
	if status != "SUCCEEDED" {
		return nil, fmt.Errorf("apify run terminated with status %s", status)
	}
	return c.fetchDataset(ctx, datasetID)
}
// waitForRun polls the run identified by runID every pollEvery until it
// reports one of the terminal states SUCCEEDED, FAILED, ABORTED or TIMED-OUT.
//
// It returns the terminal status and the run's default dataset ID. Any request
// or decode failure ends polling immediately with a "poll run" error; when ctx
// is cancelled or expires while waiting between polls, ctx.Err() is returned.
func (c *Client) waitForRun(ctx context.Context, runID string) (string, string, error) {
	pollURL := fmt.Sprintf("%s/actor-runs/%s", apiBase, url.PathEscape(runID))
	for {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, pollURL, nil)
		if err != nil {
			return "", "", err
		}
		// Authenticate via header rather than a ?token= parameter: a poll
		// that times out returns an error containing the request URL, which
		// would otherwise expose the token.
		req.Header.Set("Authorization", "Bearer "+c.APIKey)

		resp, err := c.HTTP.Do(req)
		if err != nil {
			return "", "", fmt.Errorf("poll run: %w", err)
		}
		var r runResponse
		if err := decodeJSON(resp, &r); err != nil {
			return "", "", fmt.Errorf("poll run: %w", err)
		}

		switch r.Data.Status {
		case "SUCCEEDED", "FAILED", "ABORTED", "TIMED-OUT":
			return r.Data.Status, r.Data.DefaultDatasetID, nil
		}

		// Any other status is treated as still in progress: wait, then poll
		// again unless the context ends first.
		select {
		case <-ctx.Done():
			return "", "", ctx.Err()
		case <-time.After(pollEvery):
		}
	}
}
// fetchDataset downloads the items of dataset datasetID (requested with
// clean=true and format=json) and returns each item as raw JSON.
//
// It returns an error when datasetID is empty, when the request fails, when
// the response status is 300 or above (the first 1024 bytes of the body are
// included in the message), or when the body is not a JSON array.
func (c *Client) fetchDataset(ctx context.Context, datasetID string) ([]json.RawMessage, error) {
	if datasetID == "" {
		return nil, errors.New("missing dataset id")
	}
	dsURL := fmt.Sprintf("%s/datasets/%s/items?clean=true&format=json", apiBase, url.PathEscape(datasetID))
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, dsURL, nil)
	if err != nil {
		return nil, err
	}
	// Send the token as a header so it never appears in the URL that
	// http.Client.Do embeds in its error messages.
	req.Header.Set("Authorization", "Bearer "+c.APIKey)

	resp, err := c.HTTP.Do(req)
	if err != nil {
		return nil, fmt.Errorf("fetch dataset: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best-effort snippet for diagnostics; a read error just shortens it.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		return nil, fmt.Errorf("dataset returned %d: %s", resp.StatusCode, string(b))
	}
	var items []json.RawMessage
	if err := json.NewDecoder(resp.Body).Decode(&items); err != nil {
		return nil, fmt.Errorf("decode dataset: %w", err)
	}
	return items, nil
}
// decodeJSON consumes resp and always closes its body.
//
// For a status of 300 or above it returns an error of the form
// "http <code>: <body>" carrying at most the first 1024 bytes of the body.
// Otherwise it decodes one JSON value from the body into dst and returns the
// decoder's error, if any.
func decodeJSON(resp *http.Response, dst any) error {
	defer func() {
		// Read what is left (bounded) before closing: the HTTP transport may
		// not reuse a keep-alive connection whose body was not read to EOF,
		// and this helper is called once per poll.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 64<<10))
		resp.Body.Close()
	}()
	if resp.StatusCode >= 300 {
		// Best-effort snippet for diagnostics; a read error just shortens it.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		return fmt.Errorf("http %d: %s", resp.StatusCode, string(b))
	}
	return json.NewDecoder(resp.Body).Decode(dst)
}

313
internal/apify/types.go Normal file
View File

@@ -0,0 +1,313 @@
package apify
import (
"encoding/json"
"strconv"
"strings"
)
// ActiveListingInput is the input schema for `automation-lab/ebay-scraper`.
// The actor accepts keyword searches and standard filters; it targets
// ebay.com only (no per-marketplace routing in the actor itself), so
// non-US marketplaces won't return useful results with this actor.
//
// Every field except SearchQueries carries omitempty, so zero values are left
// out of the marshalled input.
type ActiveListingInput struct {
// SearchQueries has no omitempty: the key is always emitted (null for a nil slice).
SearchQueries []string `json:"searchQueries"`
MaxProductsPerSearch int `json:"maxProductsPerSearch,omitempty"`
MaxSearchPages int `json:"maxSearchPages,omitempty"`
Sort string `json:"sort,omitempty"`
ListingType string `json:"listingType,omitempty"`
Condition []string `json:"condition,omitempty"`
// MinPrice and MaxPrice are pointers so that an explicit 0 is still sent;
// a nil pointer omits the key.
// NOTE(review): currency and units are defined by the actor — confirm.
MinPrice *int `json:"minPrice,omitempty"`
MaxPrice *int `json:"maxPrice,omitempty"`
// ProxyConfiguration is omitted entirely when nil.
ProxyConfiguration *ProxyConfiguration `json:"proxyConfiguration,omitempty"`
}
// ProxyConfiguration is the standard apify input block for proxy routing.
// eBay (and most retail sites) return 403 to datacenter IPs; passing
// {"useApifyProxy": true, "apifyProxyGroups": ["RESIDENTIAL"]} works.
type ProxyConfiguration struct {
// UseApifyProxy has no omitempty, so it is always serialized, including false.
UseApifyProxy bool `json:"useApifyProxy"`
// ApifyProxyGroups is omitted when empty.
ApifyProxyGroups []string `json:"apifyProxyGroups,omitempty"`
// ApifyProxyCountry is omitted when empty.
// NOTE(review): expected value format (e.g. country code) is not shown here — confirm.
ApifyProxyCountry string `json:"apifyProxyCountry,omitempty"`
}
// ActiveListingResult is decoded leniently to handle multiple eBay-scraper
// actors. delicious_zebu/ebay-product-listing-scraper returns productUrl /
// imageUrl / numeric price; harvestlab/ebay-scraper used url / price /
// currency. The decoder coalesces both shapes.
//
// Decode uses this type for both the SourceActiveEbay and SourcePriceCompare
// labels. Fields typed `any` accept either a JSON number or a string.
type ActiveListingResult struct {
Title string `json:"title"`
// Price is normalised with coercePrice in Decode.
Price any `json:"price"`
// OriginalPrice is decoded but not read by Decode.
OriginalPrice any `json:"originalPrice"`
// Currency falls back to "USD" in Decode when empty.
Currency string `json:"currency"`
// URL is preferred; ProductURL is the fallback when URL is empty.
URL string `json:"url"`
ProductURL string `json:"productUrl"`
// Store falls back to Marketplace, then to "ebay" for SourceActiveEbay.
Store string `json:"store"`
// Image lookup order in Decode: ImageURL, Image, Thumbnail, Images[0].
ImageURL string `json:"imageUrl"`
Image string `json:"image"`
Thumbnail string `json:"thumbnail"`
Images []string `json:"images"`
// Condition, ListingType, ShippingCost, ShippingPrice, FreeShipping,
// WatchersCount and QuantitySold are decoded but not read by Decode.
Condition string `json:"condition"`
ListingType string `json:"listingType"`
ShippingCost any `json:"shippingCost"`
ShippingPrice any `json:"shippingPrice"`
FreeShipping bool `json:"freeShipping"`
Marketplace string `json:"marketplace"`
MatchConfidence float64 `json:"matchConfidence"`
// Availability is mapped to UnifiedResult.OutOfStock via isOOS.
Availability string `json:"availability"`
WatchersCount int `json:"watchersCount"`
QuantitySold int `json:"quantitySold"`
}
// SoldListingInput is a JSON-serializable actor input for sold-listing lookups.
// NOTE(review): the target actor is not named in this file — confirm.
type SoldListingInput struct {
// Query has no omitempty and is always emitted, even when empty.
Query string `json:"query"`
Marketplace string `json:"marketplace,omitempty"`
MaxResults int `json:"maxResults,omitempty"`
DaysBack int `json:"daysBack,omitempty"`
// ProxyConfiguration is omitted entirely when nil.
ProxyConfiguration *ProxyConfiguration `json:"proxyConfiguration,omitempty"`
}
// SoldListingResult is the JSON shape of one sold-listing row. Decode in this
// file has no case that uses it.
// NOTE(review): the actor producing this shape is not named here — confirm.
type SoldListingResult struct {
Title string `json:"title"`
SoldPrice float64 `json:"soldPrice"`
// Currency is read from the "soldCurrency" key, not "currency".
Currency string `json:"soldCurrency"`
// SoldAt is read from the "endedAt" key and kept as the raw string.
SoldAt string `json:"endedAt"`
Condition string `json:"condition"`
ListingType string `json:"listingType"`
ShippingPrice float64 `json:"shippingPrice"`
URL string `json:"url"`
}
// PriceComparisonInput is a JSON-serializable actor input for price comparison.
// Every field carries omitempty, so zero values are left out.
// NOTE(review): the target actor and whether Query and URL are mutually
// exclusive are not shown in this file — confirm.
type PriceComparisonInput struct {
Query string `json:"query,omitempty"`
URL string `json:"url,omitempty"`
MatchStrictness string `json:"matchStrictness,omitempty"`
// ProxyConfiguration is omitted entirely when nil.
ProxyConfiguration *ProxyConfiguration `json:"proxyConfiguration,omitempty"`
}
// PriceComparisonResult is a strictly typed JSON shape for one price-comparison
// row. Decode in this file does not use it: rows labelled SourcePriceCompare
// are decoded through ActiveListingResult instead.
type PriceComparisonResult struct {
Title string `json:"title"`
// Price is a float64 here, so a string-typed price would fail to unmarshal.
Price float64 `json:"price"`
Currency string `json:"currency"`
URL string `json:"url"`
Store string `json:"store"`
ImageURL string `json:"imageUrl"`
Availability string `json:"availability"`
MatchConfidence float64 `json:"matchConfidence"`
}
// YahooAuctionsJPInput targets meron1122/zenmarket-scraper. ZenMarket is a
// buyer-proxy for Yahoo Auctions JP; its scraper returns ZenMarket-proxied
// listing URLs and USD-converted prices.
type YahooAuctionsJPInput struct {
// SearchTerm has no omitempty and is always emitted, even when empty.
SearchTerm string `json:"searchTerm"`
// CategoryID is serialized under the key "categoryID" (capital "ID").
CategoryID string `json:"categoryID,omitempty"`
MaxPages int `json:"maxPages,omitempty"`
MaxRemainingHours int `json:"maxRemainingHours,omitempty"`
}
// MercariJPInput targets cloud9_ai/mercari-scraper. The actor manages its
// own proxy (Japan datacenter with residential fallback), so we do not send
// a proxyConfiguration block.
//
// Every field carries omitempty, so zero values are left out of the input.
type MercariJPInput struct {
SearchKeywords []string `json:"searchKeywords,omitempty"`
ProductUrls []string `json:"productUrls,omitempty"`
Status string `json:"status,omitempty"`
SortBy string `json:"sortBy,omitempty"`
// PriceMin and PriceMax are pointers so that an explicit 0 is still sent;
// a nil pointer omits the key.
PriceMin *int `json:"priceMin,omitempty"`
PriceMax *int `json:"priceMax,omitempty"`
ItemCondition string `json:"itemCondition,omitempty"`
MaxResults int `json:"maxResults,omitempty"`
}
// YahooAuctionsJPResult matches meron1122/zenmarket-scraper output. Prices
// are USD-converted at the ZenMarket-published rate.
type YahooAuctionsJPResult struct {
// Name becomes UnifiedResult.Title in Decode.
Name string `json:"name"`
// CurrentPrice may be a number or a string; Decode normalises it with coercePrice.
CurrentPrice any `json:"current_price"`
// Photos: Decode uses the first entry, if any, as the image URL.
Photos []string `json:"photos"`
URL string `json:"url"`
// EndingDate is decoded but not read by Decode.
EndingDate string `json:"ending_date"`
}
// UnifiedResult is the common shape produced by ParseResults regardless of
// which actor type returned the data. The scheduler consumes this.
// Within this file the rows are built by Decode.
type UnifiedResult struct {
Title string
// Price is 0 when the source price was missing or could not be parsed.
Price float64
// Currency is defaulted by Decode when the source row has none
// ("USD" for eBay/price-comparison and Yahoo rows, "JPY" for Mercari rows).
Currency string
URL string
Store string
ImageURL string
// Source is the source label that was passed to Decode.
Source string
MatchConfidence float64
// OutOfStock is derived from the row's availability/status via isOOS.
OutOfStock bool
// MatchedQuery records which alias from the item's query list produced
// this row. Empty for URL-only items or rows from non-search sources.
// Decode does not set it.
MatchedQuery string
}
// Decode unmarshals a list of raw JSON items into UnifiedResult rows using the
// item shape that belongs to the given source label.
//
// Supported labels are SourceActiveEbay, SourcePriceCompare, SourceYahooJP and
// SourceMercariJP; any other label yields an empty slice. Decoding is lenient:
// an item that fails to unmarshal is skipped instead of failing the batch, and
// a price that cannot be parsed becomes 0. The returned error is currently
// always nil.
func Decode(items []json.RawMessage, source string) ([]UnifiedResult, error) {
	// firstNonEmpty returns the first non-empty candidate, or "" if none.
	firstNonEmpty := func(candidates ...string) string {
		for _, c := range candidates {
			if c != "" {
				return c
			}
		}
		return ""
	}

	out := make([]UnifiedResult, 0, len(items))
	switch source {
	case SourceActiveEbay, SourcePriceCompare:
		for _, raw := range items {
			var r ActiveListingResult
			if err := json.Unmarshal(raw, &r); err != nil {
				continue // lenient: drop rows that do not fit the shape
			}
			img := firstNonEmpty(r.ImageURL, r.Image, r.Thumbnail)
			if img == "" && len(r.Images) > 0 {
				img = r.Images[0]
			}
			store := firstNonEmpty(r.Store, r.Marketplace)
			if store == "" && source == SourceActiveEbay {
				store = "ebay"
			}
			out = append(out, UnifiedResult{
				Title:           r.Title,
				Price:           coercePrice(r.Price),
				Currency:        firstNonEmpty(r.Currency, "USD"),
				URL:             firstNonEmpty(r.URL, r.ProductURL),
				Store:           store,
				ImageURL:        img,
				Source:          source,
				MatchConfidence: r.MatchConfidence,
				OutOfStock:      isOOS(r.Availability),
			})
		}
	case SourceYahooJP:
		for _, raw := range items {
			var r YahooAuctionsJPResult
			if err := json.Unmarshal(raw, &r); err != nil {
				continue // lenient: drop rows that do not fit the shape
			}
			img := ""
			if len(r.Photos) > 0 {
				img = r.Photos[0]
			}
			out = append(out, UnifiedResult{
				Title:    r.Name,
				Price:    coercePrice(r.CurrentPrice),
				Currency: "USD",
				URL:      r.URL,
				Store:    "yahoo-auctions-jp (via zenmarket)",
				ImageURL: img,
				Source:   source,
			})
		}
	case SourceMercariJP:
		// Mercari actors vary in shape; accept either price/currentPrice and title/name.
		for _, raw := range items {
			var generic struct {
				Title string `json:"title"`
				Name  string `json:"name"`
				// Prices are decoded as `any` and passed through coercePrice,
				// as in the other branches, so a string-typed price does not
				// make the whole row fail to unmarshal.
				Price        any    `json:"price"`
				CurrentPrice any    `json:"currentPrice"`
				Currency     string `json:"currency"`
				URL          string `json:"url"`
				ImageURL     string `json:"imageUrl"`
				Image        string `json:"image"`
				Status       string `json:"status"`
			}
			if err := json.Unmarshal(raw, &generic); err != nil {
				continue // lenient: drop rows that do not fit the shape
			}
			price := coercePrice(generic.Price)
			if price == 0 {
				price = coercePrice(generic.CurrentPrice)
			}
			out = append(out, UnifiedResult{
				Title:      firstNonEmpty(generic.Title, generic.Name),
				Price:      price,
				Currency:   firstNonEmpty(generic.Currency, "JPY"),
				URL:        generic.URL,
				Store:      "mercari-jp",
				ImageURL:   firstNonEmpty(generic.ImageURL, generic.Image),
				Source:     source,
				OutOfStock: isOOS(generic.Status),
			})
		}
	}
	return out, nil
}
// Source labels. They select the decoding shape in Decode and are copied into
// UnifiedResult.Source.
const (
// Labels that Decode has a case for.
SourceActiveEbay = "ebay"
SourcePriceCompare = "price-comparison"
SourceYahooJP = "yahoo-auctions-jp"
SourceMercariJP = "mercari-jp"
// Sold-listing labels. Decode in this file has no case for them, so passing
// one yields an empty result.
SourceSoldEbay = "ebay-sold"
SourceSoldYahooJP = "yahoo-auctions-jp-sold"
)
// coercePrice turns a loosely typed price value into a float64.
//
// Numeric inputs (float64, float32, int, int64) are converted directly. A
// string is reduced to its digits, '.' and '-' characters — which removes
// currency symbols and thousands separators such as in "$24.99" or
// "1,299.00" — and the remainder is parsed as a float. nil, unsupported
// types and unparsable strings all give 0, so FilterResults can drop the row
// cleanly.
func coercePrice(v any) float64 {
	text, isString := v.(string)
	if !isString {
		switch n := v.(type) {
		case float64:
			return n
		case float32:
			return float64(n)
		case int:
			return float64(n)
		case int64:
			return float64(n)
		default:
			// nil and every other type count as "no price".
			return 0
		}
	}

	// Keep only the characters that can be part of a plain decimal number.
	var kept strings.Builder
	kept.Grow(len(text))
	for _, r := range text {
		if ('0' <= r && r <= '9') || r == '.' || r == '-' {
			kept.WriteRune(r)
		}
	}

	parsed, err := strconv.ParseFloat(kept.String(), 64)
	if err != nil {
		return 0
	}
	return parsed
}
// isOOS reports whether an availability/status string marks a listing as no
// longer purchasable. It recognises "out_of_stock", "sold" and "ended",
// compared case-insensitively so that variants such as "ENDED" or "Sold" are
// treated the same as the spellings listed here. Anything else, including the
// empty string, is considered in stock.
func isOOS(s string) bool {
	switch strings.ToLower(s) {
	case "out_of_stock", "sold", "ended":
		return true
	}
	return false
}