diff options
| author | nsfisis <nsfisis@gmail.com> | 2026-02-14 12:20:31 +0900 |
|---|---|---|
| committer | nsfisis <nsfisis@gmail.com> | 2026-02-14 12:20:31 +0900 |
| commit | 042fcb5c4eac16f18fc051f55a6c63ca9e97306b (patch) | |
| tree | 1a61d1f7690e933a8d1e452e744ac02db14af042 | |
| parent | fffd36268a216044523c3f5227c3d375608c36dc (diff) | |
| download | feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.tar.gz feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.tar.zst feedaka-042fcb5c4eac16f18fc051f55a6c63ca9e97306b.zip | |
feat(feed): auto-discover feed URLs from HTML pages
When an HTML page is provided instead of a direct feed URL, parse
`<link rel="alternate">` tags to find RSS/Atom feeds. Atom is preferred
over RSS when both are present.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
| -rw-r--r-- | backend/api/handler_feeds.go | 8 | ||||
| -rw-r--r-- | backend/cmd/serve.go | 4 | ||||
| -rw-r--r-- | backend/feed/discover.go | 94 | ||||
| -rw-r--r-- | backend/feed/discover_test.go | 129 | ||||
| -rw-r--r-- | backend/feed/feed.go | 63 | ||||
| -rw-r--r-- | backend/go.mod | 2 |
6 files changed, 289 insertions, 11 deletions
diff --git a/backend/api/handler_feeds.go b/backend/api/handler_feeds.go index f0a8785..4d16e4b 100644 --- a/backend/api/handler_feeds.go +++ b/backend/api/handler_feeds.go @@ -46,14 +46,14 @@ func (h *Handler) FeedsAddFeed(ctx context.Context, request FeedsAddFeedRequestO return nil, fmt.Errorf("authentication required") } - f, err := feed.Fetch(ctx, request.Body.Url) + result, err := feed.Fetch(ctx, request.Body.Url) if err != nil { return FeedsAddFeed400JSONResponse{Message: fmt.Sprintf("failed to parse feed: %v", err)}, nil } dbFeed, err := h.Queries.CreateFeed(ctx, db.CreateFeedParams{ - Url: request.Body.Url, - Title: f.Title, + Url: result.URL, + Title: result.Feed.Title, FetchedAt: time.Now().UTC().Format(time.RFC3339), UserID: userID, }) @@ -61,7 +61,7 @@ func (h *Handler) FeedsAddFeed(ctx context.Context, request FeedsAddFeedRequestO return FeedsAddFeed400JSONResponse{Message: fmt.Sprintf("failed to insert feed: %v", err)}, nil } - if err := feed.Sync(ctx, h.Queries, dbFeed.ID, f); err != nil { + if err := feed.Sync(ctx, h.Queries, dbFeed.ID, result.Feed); err != nil { return FeedsAddFeed400JSONResponse{Message: fmt.Sprintf("failed to sync articles: %v", err)}, nil } diff --git a/backend/cmd/serve.go b/backend/cmd/serve.go index c65a19f..4b32868 100644 --- a/backend/cmd/serve.go +++ b/backend/cmd/serve.go @@ -25,11 +25,11 @@ import ( func fetchOneFeed(feedID int64, url string, ctx context.Context, queries *db.Queries) error { log.Printf("Fetching %s...\n", url) - f, err := feed.Fetch(ctx, url) + result, err := feed.Fetch(ctx, url) if err != nil { return err } - return feed.Sync(ctx, queries, feedID, f) + return feed.Sync(ctx, queries, feedID, result.Feed) } func listFeedsToBeFetched(ctx context.Context, queries *db.Queries) (map[int64]string, error) { diff --git a/backend/feed/discover.go b/backend/feed/discover.go new file mode 100644 index 0000000..962cbcd --- /dev/null +++ b/backend/feed/discover.go @@ -0,0 +1,94 @@ +package feed + +import ( + "io" + 
"net/url" + "strings" + + "golang.org/x/net/html" +) + +type feedLink struct { + URL string + Type string // "atom" or "rss" +} + +// discoverFeeds parses an HTML document and extracts feed URLs from +// <link rel="alternate"> tags. It resolves relative URLs against baseURL. +func discoverFeeds(body io.Reader, baseURL *url.URL) []feedLink { + var links []feedLink + z := html.NewTokenizer(body) + for { + tt := z.Next() + switch tt { + case html.ErrorToken: + return links + case html.StartTagToken, html.SelfClosingTagToken: + tn, _ := z.TagName() + tagName := string(tn) + + if tagName == "body" { + return links + } + if tagName != "link" { + continue + } + + attrs := tokenAttrs(z) + rel := strings.ToLower(attrs["rel"]) + typ := strings.ToLower(attrs["type"]) + href := attrs["href"] + + if rel != "alternate" || href == "" { + continue + } + + var feedType string + switch typ { + case "application/atom+xml": + feedType = "atom" + case "application/rss+xml": + feedType = "rss" + default: + continue + } + + ref, err := url.Parse(href) + if err != nil { + continue + } + resolved := baseURL.ResolveReference(ref).String() + links = append(links, feedLink{URL: resolved, Type: feedType}) + } + } +} + +// selectFeed picks the best feed URL from discovered links. +// Prefers Atom over RSS. 
+func selectFeed(links []feedLink) string { + for _, l := range links { + if l.Type == "atom" { + return l.URL + } + } + for _, l := range links { + if l.Type == "rss" { + return l.URL + } + } + return "" +} + +func tokenAttrs(z *html.Tokenizer) map[string]string { + attrs := make(map[string]string) + for { + key, val, more := z.TagAttr() + if len(key) > 0 { + attrs[string(key)] = string(val) + } + if !more { + break + } + } + return attrs +} diff --git a/backend/feed/discover_test.go b/backend/feed/discover_test.go new file mode 100644 index 0000000..4a1315e --- /dev/null +++ b/backend/feed/discover_test.go @@ -0,0 +1,129 @@ +package feed + +import ( + "net/url" + "strings" + "testing" +) + +func mustParseURL(s string) *url.URL { + u, err := url.Parse(s) + if err != nil { + panic(err) + } + return u +} + +func TestDiscoverFeeds_AtomAndRSS(t *testing.T) { + html := `<!DOCTYPE html> +<html> +<head> + <link rel="alternate" type="application/atom+xml" href="/feed.atom" title="Atom Feed"> + <link rel="alternate" type="application/rss+xml" href="/feed.rss" title="RSS Feed"> +</head> +<body></body> +</html>` + + base := mustParseURL("https://example.com/blog") + links := discoverFeeds(strings.NewReader(html), base) + + if len(links) != 2 { + t.Fatalf("expected 2 links, got %d", len(links)) + } + if links[0].URL != "https://example.com/feed.atom" || links[0].Type != "atom" { + t.Errorf("unexpected first link: %+v", links[0]) + } + if links[1].URL != "https://example.com/feed.rss" || links[1].Type != "rss" { + t.Errorf("unexpected second link: %+v", links[1]) + } +} + +func TestDiscoverFeeds_AbsoluteURL(t *testing.T) { + html := `<html><head> + <link rel="alternate" type="application/atom+xml" href="https://other.com/feed.xml"> +</head><body></body></html>` + + base := mustParseURL("https://example.com") + links := discoverFeeds(strings.NewReader(html), base) + + if len(links) != 1 { + t.Fatalf("expected 1 link, got %d", len(links)) + } + if links[0].URL != 
"https://other.com/feed.xml" { + t.Errorf("expected absolute URL preserved, got %s", links[0].URL) + } +} + +func TestDiscoverFeeds_NoFeeds(t *testing.T) { + html := `<html><head><title>No feeds</title></head><body></body></html>` + links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com")) + if len(links) != 0 { + t.Fatalf("expected 0 links, got %d", len(links)) + } +} + +func TestDiscoverFeeds_IgnoresNonAlternate(t *testing.T) { + html := `<html><head> + <link rel="stylesheet" type="text/css" href="/style.css"> + <link rel="alternate" type="application/atom+xml" href="/feed.atom"> +</head><body></body></html>` + + links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com")) + if len(links) != 1 { + t.Fatalf("expected 1 link, got %d", len(links)) + } +} + +func TestDiscoverFeeds_IgnoresUnknownTypes(t *testing.T) { + html := `<html><head> + <link rel="alternate" type="application/json" href="/feed.json"> + <link rel="alternate" type="application/rss+xml" href="/feed.rss"> +</head><body></body></html>` + + links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com")) + if len(links) != 1 { + t.Fatalf("expected 1 link, got %d", len(links)) + } + if links[0].Type != "rss" { + t.Errorf("expected rss, got %s", links[0].Type) + } +} + +func TestDiscoverFeeds_StopsAtBody(t *testing.T) { + html := `<html><head></head><body> + <link rel="alternate" type="application/atom+xml" href="/feed.atom"> +</body></html>` + + links := discoverFeeds(strings.NewReader(html), mustParseURL("https://example.com")) + if len(links) != 0 { + t.Fatalf("expected 0 links (should stop at body), got %d", len(links)) + } +} + +func TestSelectFeed_PrefersAtom(t *testing.T) { + links := []feedLink{ + {URL: "https://example.com/rss", Type: "rss"}, + {URL: "https://example.com/atom", Type: "atom"}, + } + got := selectFeed(links) + if got != "https://example.com/atom" { + t.Errorf("expected Atom URL, got %s", got) + } +} + +func 
TestSelectFeed_FallsBackToRSS(t *testing.T) { + links := []feedLink{ + {URL: "https://example.com/rss", Type: "rss"}, + } + got := selectFeed(links) + if got != "https://example.com/rss" { + t.Errorf("expected RSS URL, got %s", got) + } +} + +func TestSelectFeed_EmptyList(t *testing.T) { + got := selectFeed(nil) + if got != "" { + t.Errorf("expected empty string, got %s", got) + } +} diff --git a/backend/feed/feed.go b/backend/feed/feed.go index 4349d1e..2d84798 100644 --- a/backend/feed/feed.go +++ b/backend/feed/feed.go @@ -2,7 +2,10 @@ package feed import ( "context" + "errors" "fmt" + "net/http" + "net/url" "time" "github.com/mmcdole/gofeed" @@ -10,15 +13,67 @@ import ( "undef.ninja/x/feedaka/db" ) -func Fetch(ctx context.Context, url string) (*gofeed.Feed, error) { +// FetchResult holds the result of fetching a feed, including the resolved URL. +type FetchResult struct { + Feed *gofeed.Feed + URL string +} + +func Fetch(ctx context.Context, rawURL string) (*FetchResult, error) { fp := gofeed.NewParser() ctx, cancel := context.WithTimeout(ctx, 10*time.Second) defer cancel() - feed, err := fp.ParseURLWithContext(url, ctx) + + f, err := fp.ParseURLWithContext(rawURL, ctx) + if err == nil { + return &FetchResult{Feed: f, URL: rawURL}, nil + } + + if !errors.Is(err, gofeed.ErrFeedTypeNotDetected) { + return nil, fmt.Errorf("failed to fetch %s: %w", rawURL, err) + } + + discoveredURL, discoverErr := discoverFeedURL(ctx, rawURL) + if discoverErr != nil { + return nil, fmt.Errorf("failed to fetch %s: not a feed and auto-discovery failed: %w", rawURL, discoverErr) + } + + f, err = fp.ParseURLWithContext(discoveredURL, ctx) if err != nil { - return nil, fmt.Errorf("failed to fetch %s: %w", url, err) + return nil, fmt.Errorf("failed to fetch discovered feed %s: %w", discoveredURL, err) } - return feed, nil + + return &FetchResult{Feed: f, URL: discoveredURL}, nil +} + +func discoverFeedURL(ctx context.Context, rawURL string) (string, error) { + req, err := 
http.NewRequestWithContext(ctx, "GET", rawURL, nil) + if err != nil { + return "", err + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "", fmt.Errorf("HTTP %d", resp.StatusCode) + } + + base, err := url.Parse(rawURL) + if err != nil { + return "", err + } + + links := discoverFeeds(resp.Body, base) + feedURL := selectFeed(links) + if feedURL == "" { + return "", fmt.Errorf("no feed links found in HTML") + } + + return feedURL, nil } func Sync(ctx context.Context, queries *db.Queries, feedID int64, f *gofeed.Feed) error { diff --git a/backend/go.mod b/backend/go.mod index d8a84eb..cb62854 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -13,6 +13,7 @@ require ( github.com/mmcdole/gofeed v1.3.0 github.com/oapi-codegen/runtime v1.1.2 golang.org/x/crypto v0.39.0 + golang.org/x/net v0.41.0 ) require ( @@ -82,7 +83,6 @@ require ( go.uber.org/zap v1.27.0 // indirect golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b // indirect golang.org/x/mod v0.25.0 // indirect - golang.org/x/net v0.41.0 // indirect golang.org/x/sync v0.15.0 // indirect golang.org/x/sys v0.33.0 // indirect golang.org/x/text v0.26.0 // indirect |
