Files
veola/internal/apify/client.go
2026-05-13 19:42:49 -07:00

153 lines
4.1 KiB
Go

package apify
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
)
// Fixed settings shared by every request this package makes.
const (
	// apiBase is the root of the Apify v2 REST API; each endpoint URL is
	// built by appending a path to it.
	apiBase = "https://api.apify.com/v2"

	// pollTimeout caps how long Run waits for a started run to reach a
	// terminal status.
	pollTimeout = 5 * time.Minute

	// pollEvery is the pause between two consecutive status polls.
	pollEvery = 3 * time.Second
)
// Client is a thin wrapper around the Apify run-and-fetch lifecycle.
// New returns one with both fields populated.
type Client struct {
// APIKey is the Apify API token sent with every request. Run refuses to
// start when it is empty.
APIKey string
// HTTP is the client used for every API call. New sets it to a client with
// a 30-second timeout.
HTTP *http.Client
}
// New builds a Client that authenticates with apiKey and issues its requests
// through a dedicated http.Client whose timeout is 30 seconds. apiKey is not
// validated here; an empty key is only reported once Run is called.
func New(apiKey string) *Client {
	httpClient := &http.Client{Timeout: 30 * time.Second}
	client := &Client{HTTP: httpClient}
	client.APIKey = apiKey
	return client
}
// runResponse is the JSON envelope decoded from both the start-run and the
// poll-run responses. Only the fields this package reads are declared.
type runResponse struct {
Data struct {
// ID identifies the run; waitForRun builds its poll URL from it.
ID string `json:"id"`
// Status is the run state; waitForRun stops polling once it is SUCCEEDED,
// FAILED, ABORTED or TIMED-OUT.
Status string `json:"status"`
// DefaultDatasetID names the dataset whose items fetchDataset downloads.
DefaultDatasetID string `json:"defaultDatasetId"`
} `json:"data"`
}
// Run starts an actor run, waits for it to finish, and returns the items of
// the run's default dataset as raw JSON.
//
// actorID may be written "user/actor" or "user~actor"; both are normalized to
// the "~" form used in Apify URLs. input is marshaled to JSON and sent as the
// request body. Waiting is bounded by pollTimeout in addition to whatever
// deadline ctx already carries. When the wait is abandoned, no abort request
// is sent for the run.
//
// An error is returned when the client is not configured, when actorID is
// empty, when a request fails or answers with a status of 300 or above, when
// ctx is done, or when the run ends in any status other than SUCCEEDED.
func (c *Client) Run(ctx context.Context, actorID string, input any) ([]json.RawMessage, error) {
	if c.APIKey == "" {
		return nil, errors.New("apify api_key not configured")
	}
	if c.HTTP == nil {
		// A hand-built Client{} would otherwise panic on the first request.
		return nil, errors.New("apify http client not configured")
	}
	if actorID == "" {
		return nil, errors.New("apify actor id is empty")
	}

	body, err := json.Marshal(input)
	if err != nil {
		return nil, fmt.Errorf("encode run input: %w", err)
	}

	// Apify URLs use "~" to separate username and actor name, never "/".
	// Accept either form in config and normalize before path-escaping.
	urlActorID := strings.ReplaceAll(actorID, "/", "~")
	startURL := fmt.Sprintf("%s/acts/%s/runs", apiBase, url.PathEscape(urlActorID))
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, startURL, bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")
	// The token travels in a header rather than the query string: transport
	// errors from net/http embed the request URL, so a token in the URL would
	// be copied into every wrapped error and anything that logs it.
	req.Header.Set("Authorization", "Bearer "+c.APIKey)

	resp, err := c.HTTP.Do(req)
	if err != nil {
		return nil, fmt.Errorf("start run: %w", err)
	}
	var runResp runResponse
	if err := decodeJSON(resp, &runResp); err != nil {
		return nil, fmt.Errorf("start run: %w", err)
	}
	if runResp.Data.ID == "" {
		return nil, errors.New("start run: missing run id")
	}

	pollCtx, cancel := context.WithTimeout(ctx, pollTimeout)
	defer cancel()
	status, datasetID, err := c.waitForRun(pollCtx, runResp.Data.ID)
	if err != nil {
		return nil, err
	}
	if status != "SUCCEEDED" {
		return nil, fmt.Errorf("apify run terminated with status %s", status)
	}

	// The download runs under the caller's ctx, not pollCtx, so the polling
	// deadline does not cut into the time available for fetching the items.
	return c.fetchDataset(ctx, datasetID)
}
// waitForRun polls the run identified by runID, pausing pollEvery between
// requests, until the run reports a terminal status (SUCCEEDED, FAILED,
// ABORTED or TIMED-OUT). It returns that status and the run's default
// dataset ID.
//
// Any request or decode failure ends the wait immediately with a "poll run"
// error. If ctx is done during the pause between polls, ctx.Err() is returned.
func (c *Client) waitForRun(ctx context.Context, runID string) (string, string, error) {
	pollURL := fmt.Sprintf("%s/actor-runs/%s", apiBase, url.PathEscape(runID))
	for {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, pollURL, nil)
		if err != nil {
			return "", "", err
		}
		// The token travels in a header rather than the query string:
		// transport errors from net/http embed the request URL, so a token in
		// the URL would be copied into every "poll run" error.
		req.Header.Set("Authorization", "Bearer "+c.APIKey)

		resp, err := c.HTTP.Do(req)
		if err != nil {
			return "", "", fmt.Errorf("poll run: %w", err)
		}
		var r runResponse
		if err := decodeJSON(resp, &r); err != nil {
			return "", "", fmt.Errorf("poll run: %w", err)
		}

		switch r.Data.Status {
		case "SUCCEEDED", "FAILED", "ABORTED", "TIMED-OUT":
			return r.Data.Status, r.Data.DefaultDatasetID, nil
		}

		// Any other status means the run is still in progress: pause, then
		// poll again. The timer is stopped explicitly on cancellation so it
		// is released right away instead of lingering until it fires.
		pause := time.NewTimer(pollEvery)
		select {
		case <-ctx.Done():
			pause.Stop()
			return "", "", ctx.Err()
		case <-pause.C:
		}
	}
}
// fetchDataset downloads the items of the dataset identified by datasetID,
// requesting the "clean" JSON form, and returns each item as undecoded JSON.
//
// It returns an error when datasetID is empty, when the request fails, when
// the response status is 300 or above (the error carries up to 1 KiB of the
// response body), or when the body is not a JSON array.
func (c *Client) fetchDataset(ctx context.Context, datasetID string) ([]json.RawMessage, error) {
	if datasetID == "" {
		return nil, errors.New("missing dataset id")
	}

	dsURL := fmt.Sprintf("%s/datasets/%s/items?clean=true&format=json", apiBase, url.PathEscape(datasetID))
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, dsURL, nil)
	if err != nil {
		return nil, err
	}
	// The token travels in a header rather than the query string: transport
	// errors from net/http embed the request URL, so a token in the URL would
	// be copied into every "fetch dataset" error.
	req.Header.Set("Authorization", "Bearer "+c.APIKey)

	resp, err := c.HTTP.Do(req)
	if err != nil {
		return nil, fmt.Errorf("fetch dataset: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best-effort excerpt for the error message; a read failure here
		// only shortens the excerpt, so its error is deliberately ignored.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		return nil, fmt.Errorf("dataset returned %d: %s", resp.StatusCode, string(b))
	}

	var items []json.RawMessage
	if err := json.NewDecoder(resp.Body).Decode(&items); err != nil {
		return nil, fmt.Errorf("decode dataset: %w", err)
	}
	return items, nil
}
// decodeJSON consumes resp: it decodes a JSON body into dst when the status
// is below 300, and otherwise returns an error carrying the status code and
// up to 1 KiB of the body. The body is always closed before returning, so the
// caller must not read or close it afterwards.
func decodeJSON(resp *http.Response, dst any) error {
	// Upper bound on how much unread body is discarded before closing.
	const maxDrain = 256 << 10

	defer func() {
		// The transport only reuses a connection whose body was read to EOF.
		// The JSON decoder stops at the end of the value and the error path
		// reads at most 1 KiB, so drain what is left (bounded, best effort)
		// before closing.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxDrain))
		_ = resp.Body.Close()
	}()

	if resp.StatusCode >= 300 {
		// Best-effort excerpt for the error message; a read failure here
		// only shortens the excerpt, so its error is deliberately ignored.
		excerpt, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		return fmt.Errorf("http %d: %s", resp.StatusCode, string(excerpt))
	}
	return json.NewDecoder(resp.Body).Decode(dst)
}