summaryrefslogtreecommitdiffstats
path: root/modules/indexer/code
diff options
context:
space:
mode:
authorDaniel Baumann <daniel@debian.org>2024-10-18 20:33:49 +0200
committerDaniel Baumann <daniel@debian.org>2024-10-18 20:33:49 +0200
commitdd136858f1ea40ad3c94191d647487fa4f31926c (patch)
tree58fec94a7b2a12510c9664b21793f1ed560c6518 /modules/indexer/code
parentInitial commit. (diff)
downloadforgejo-dd136858f1ea40ad3c94191d647487fa4f31926c.tar.xz
forgejo-dd136858f1ea40ad3c94191d647487fa4f31926c.zip
Adding upstream version 9.0.0.
Signed-off-by: Daniel Baumann <daniel@debian.org>
Diffstat (limited to 'modules/indexer/code')
-rw-r--r--modules/indexer/code/bleve/bleve.go354
-rw-r--r--modules/indexer/code/elasticsearch/elasticsearch.go388
-rw-r--r--modules/indexer/code/elasticsearch/elasticsearch_test.go16
-rw-r--r--modules/indexer/code/git.go199
-rw-r--r--modules/indexer/code/indexer.go310
-rw-r--r--modules/indexer/code/indexer_test.go145
-rw-r--r--modules/indexer/code/internal/indexer.go54
-rw-r--r--modules/indexer/code/internal/model.go44
-rw-r--r--modules/indexer/code/internal/util.go32
-rw-r--r--modules/indexer/code/search.go228
10 files changed, 1770 insertions, 0 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
new file mode 100644
index 0000000..cf9fcbd
--- /dev/null
+++ b/modules/indexer/code/bleve/bleve.go
@@ -0,0 +1,354 @@
+// Copyright 2019 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package bleve
+
+import (
+ "bufio"
+ "context"
+ "fmt"
+ "io"
+ "strconv"
+ "strings"
+ "time"
+
+ repo_model "code.gitea.io/gitea/models/repo"
+ "code.gitea.io/gitea/modules/analyze"
+ "code.gitea.io/gitea/modules/charset"
+ "code.gitea.io/gitea/modules/git"
+ "code.gitea.io/gitea/modules/gitrepo"
+ "code.gitea.io/gitea/modules/indexer/code/internal"
+ indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
+ inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
+ "code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/modules/timeutil"
+ "code.gitea.io/gitea/modules/typesniffer"
+
+ "github.com/blevesearch/bleve/v2"
+ analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
+ analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
+ "github.com/blevesearch/bleve/v2/analysis/token/camelcase"
+ "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
+ "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
+ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
+ "github.com/blevesearch/bleve/v2/mapping"
+ "github.com/blevesearch/bleve/v2/search/query"
+ "github.com/go-enry/go-enry/v2"
+)
+
+const (
+ unicodeNormalizeName = "unicodeNormalize"
+ maxBatchSize = 16
+ // fuzzyDenominator determines the levenshtein distance per each character of a keyword
+ fuzzyDenominator = 4
+ // see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311
+ maxFuzziness = 2
+)
+
+func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
+ return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
+ "type": unicodenorm.Name,
+ "form": unicodenorm.NFC,
+ })
+}
+
+// RepoIndexerData data stored in the repo indexer
+type RepoIndexerData struct {
+ RepoID int64
+ CommitID string
+ Content string
+ Language string
+ UpdatedAt time.Time
+}
+
+// Type returns the document type, for bleve's mapping.Classifier interface.
+func (d *RepoIndexerData) Type() string {
+ return repoIndexerDocType
+}
+
+const (
+ repoIndexerAnalyzer = "repoIndexerAnalyzer"
+ repoIndexerDocType = "repoIndexerDocType"
+ repoIndexerLatestVersion = 6
+)
+
+// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
+func generateBleveIndexMapping() (mapping.IndexMapping, error) {
+ docMapping := bleve.NewDocumentMapping()
+ numericFieldMapping := bleve.NewNumericFieldMapping()
+ numericFieldMapping.IncludeInAll = false
+ docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
+
+ textFieldMapping := bleve.NewTextFieldMapping()
+ textFieldMapping.IncludeInAll = false
+ docMapping.AddFieldMappingsAt("Content", textFieldMapping)
+
+ termFieldMapping := bleve.NewTextFieldMapping()
+ termFieldMapping.IncludeInAll = false
+ termFieldMapping.Analyzer = analyzer_keyword.Name
+ docMapping.AddFieldMappingsAt("Language", termFieldMapping)
+ docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
+
+ timeFieldMapping := bleve.NewDateTimeFieldMapping()
+ timeFieldMapping.IncludeInAll = false
+ docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
+
+ mapping := bleve.NewIndexMapping()
+ if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
+ return nil, err
+ } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
+ "type": analyzer_custom.Name,
+ "char_filters": []string{},
+ "tokenizer": unicode.Name,
+ "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
+ }); err != nil {
+ return nil, err
+ }
+ mapping.DefaultAnalyzer = repoIndexerAnalyzer
+ mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
+ mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
+
+ return mapping, nil
+}
+
+var _ internal.Indexer = &Indexer{}
+
+// Indexer represents a bleve indexer implementation
+type Indexer struct {
+ inner *inner_bleve.Indexer
+ indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
+}
+
+// NewIndexer creates a new bleve local indexer
+func NewIndexer(indexDir string) *Indexer {
+ inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
+ return &Indexer{
+ Indexer: inner,
+ inner: inner,
+ }
+}
+
+func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
+ update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
+) error {
+ // Ignore vendored files in code search
+ if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
+ return nil
+ }
+
+ size := update.Size
+
+ var err error
+ if !update.Sized {
+ var stdout string
+ stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
+ if err != nil {
+ return err
+ }
+ if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
+ return fmt.Errorf("misformatted git cat-file output: %w", err)
+ }
+ }
+
+ if size > setting.Indexer.MaxIndexerFileSize {
+ return b.addDelete(update.Filename, repo, batch)
+ }
+
+ if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
+ return err
+ }
+
+ _, _, size, err = git.ReadBatchLine(batchReader)
+ if err != nil {
+ return err
+ }
+
+ fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
+ if err != nil {
+ return err
+ } else if !typesniffer.DetectContentType(fileContents).IsText() {
+ // FIXME: UTF-16 files will probably fail here
+ return nil
+ }
+
+ if _, err = batchReader.Discard(1); err != nil {
+ return err
+ }
+ id := internal.FilenameIndexerID(repo.ID, update.Filename)
+ return batch.Index(id, &RepoIndexerData{
+ RepoID: repo.ID,
+ CommitID: commitSha,
+ Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
+ Language: analyze.GetCodeLanguage(update.Filename, fileContents),
+ UpdatedAt: time.Now().UTC(),
+ })
+}
+
+func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
+ id := internal.FilenameIndexerID(repo.ID, filename)
+ return batch.Delete(id)
+}
+
+// Index indexes the data
+func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
+ batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
+ if len(changes.Updates) > 0 {
+ r, err := gitrepo.OpenRepository(ctx, repo)
+ if err != nil {
+ return err
+ }
+ defer r.Close()
+ gitBatch, err := r.NewBatch(ctx)
+ if err != nil {
+ return err
+ }
+ defer gitBatch.Close()
+
+ for _, update := range changes.Updates {
+ if err := b.addUpdate(ctx, gitBatch.Writer, gitBatch.Reader, sha, update, repo, batch); err != nil {
+ return err
+ }
+ }
+ gitBatch.Close()
+ }
+ for _, filename := range changes.RemovedFilenames {
+ if err := b.addDelete(filename, repo, batch); err != nil {
+ return err
+ }
+ }
+ return batch.Flush()
+}
+
+// Delete deletes indexes by ids
+func (b *Indexer) Delete(_ context.Context, repoID int64) error {
+ query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
+ searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
+ result, err := b.inner.Indexer.Search(searchRequest)
+ if err != nil {
+ return err
+ }
+ batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
+ for _, hit := range result.Hits {
+ if err = batch.Delete(hit.ID); err != nil {
+ return err
+ }
+ }
+ return batch.Flush()
+}
+
+// Search searches for files in the specified repo.
+// Returns the matching file-paths
+func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
+ var (
+ indexerQuery query.Query
+ keywordQuery query.Query
+ )
+
+ phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
+ phraseQuery.FieldVal = "Content"
+ phraseQuery.Analyzer = repoIndexerAnalyzer
+ keywordQuery = phraseQuery
+ if opts.IsKeywordFuzzy {
+ phraseQuery.Fuzziness = min(maxFuzziness, len(opts.Keyword)/fuzzyDenominator)
+ }
+
+ if len(opts.RepoIDs) > 0 {
+ repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
+ for _, repoID := range opts.RepoIDs {
+ repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
+ }
+
+ indexerQuery = bleve.NewConjunctionQuery(
+ bleve.NewDisjunctionQuery(repoQueries...),
+ keywordQuery,
+ )
+ } else {
+ indexerQuery = keywordQuery
+ }
+
+ // Save for reuse without language filter
+ facetQuery := indexerQuery
+ if len(opts.Language) > 0 {
+ languageQuery := bleve.NewMatchQuery(opts.Language)
+ languageQuery.FieldVal = "Language"
+ languageQuery.Analyzer = analyzer_keyword.Name
+
+ indexerQuery = bleve.NewConjunctionQuery(
+ indexerQuery,
+ languageQuery,
+ )
+ }
+
+ from, pageSize := opts.GetSkipTake()
+ searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
+ searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+ searchRequest.IncludeLocations = true
+
+ if len(opts.Language) == 0 {
+ searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
+ }
+
+ result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
+ if err != nil {
+ return 0, nil, nil, err
+ }
+
+ total := int64(result.Total)
+
+ searchResults := make([]*internal.SearchResult, len(result.Hits))
+ for i, hit := range result.Hits {
+ startIndex, endIndex := -1, -1
+ for _, locations := range hit.Locations["Content"] {
+ location := locations[0]
+ locationStart := int(location.Start)
+ locationEnd := int(location.End)
+ if startIndex < 0 || locationStart < startIndex {
+ startIndex = locationStart
+ }
+ if endIndex < 0 || locationEnd > endIndex {
+ endIndex = locationEnd
+ }
+ }
+ language := hit.Fields["Language"].(string)
+ var updatedUnix timeutil.TimeStamp
+ if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
+ updatedUnix = timeutil.TimeStamp(t.Unix())
+ }
+ searchResults[i] = &internal.SearchResult{
+ RepoID: int64(hit.Fields["RepoID"].(float64)),
+ StartIndex: startIndex,
+ EndIndex: endIndex,
+ Filename: internal.FilenameOfIndexerID(hit.ID),
+ Content: hit.Fields["Content"].(string),
+ CommitID: hit.Fields["CommitID"].(string),
+ UpdatedUnix: updatedUnix,
+ Language: language,
+ Color: enry.GetColor(language),
+ }
+ }
+
+ searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
+ if len(opts.Language) > 0 {
+ // Use separate query to go get all language counts
+ facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
+ facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+ facetRequest.IncludeLocations = true
+ facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
+
+ if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
+ return 0, nil, nil, err
+ }
+ }
+ languagesFacet := result.Facets["languages"]
+ for _, term := range languagesFacet.Terms.Terms() {
+ if len(term.Term) == 0 {
+ continue
+ }
+ searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
+ Language: term.Term,
+ Color: enry.GetColor(term.Term),
+ Count: term.Count,
+ })
+ }
+ return total, searchResults, searchResultLanguages, nil
+}
diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go
new file mode 100644
index 0000000..aee5668
--- /dev/null
+++ b/modules/indexer/code/elasticsearch/elasticsearch.go
@@ -0,0 +1,388 @@
+// Copyright 2020 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package elasticsearch
+
+import (
+ "bufio"
+ "context"
+ "fmt"
+ "io"
+ "strconv"
+ "strings"
+
+ repo_model "code.gitea.io/gitea/models/repo"
+ "code.gitea.io/gitea/modules/analyze"
+ "code.gitea.io/gitea/modules/charset"
+ "code.gitea.io/gitea/modules/git"
+ "code.gitea.io/gitea/modules/gitrepo"
+ "code.gitea.io/gitea/modules/indexer/code/internal"
+ indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
+ inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
+ "code.gitea.io/gitea/modules/json"
+ "code.gitea.io/gitea/modules/log"
+ "code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/modules/timeutil"
+ "code.gitea.io/gitea/modules/typesniffer"
+
+ "github.com/go-enry/go-enry/v2"
+ "github.com/olivere/elastic/v7"
+)
+
+const (
+ esRepoIndexerLatestVersion = 1
+ // multi-match-types, currently only 2 types are used
+ // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
+ esMultiMatchTypeBestFields = "best_fields"
+ esMultiMatchTypePhrasePrefix = "phrase_prefix"
+)
+
+var _ internal.Indexer = &Indexer{}
+
+// Indexer implements Indexer interface
+type Indexer struct {
+ inner *inner_elasticsearch.Indexer
+ indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
+}
+
+// NewIndexer creates a new elasticsearch indexer
+func NewIndexer(url, indexerName string) *Indexer {
+ inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
+ indexer := &Indexer{
+ inner: inner,
+ Indexer: inner,
+ }
+ return indexer
+}
+
+const (
+ defaultMapping = `{
+ "mappings": {
+ "properties": {
+ "repo_id": {
+ "type": "long",
+ "index": true
+ },
+ "content": {
+ "type": "text",
+ "term_vector": "with_positions_offsets",
+ "index": true
+ },
+ "commit_id": {
+ "type": "keyword",
+ "index": true
+ },
+ "language": {
+ "type": "keyword",
+ "index": true
+ },
+ "updated_at": {
+ "type": "long",
+ "index": true
+ }
+ }
+ }
+ }`
+)
+
+func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
+ // Ignore vendored files in code search
+ if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
+ return nil, nil
+ }
+
+ size := update.Size
+ var err error
+ if !update.Sized {
+ var stdout string
+ stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
+ if err != nil {
+ return nil, err
+ }
+ if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
+ return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
+ }
+ }
+
+ if size > setting.Indexer.MaxIndexerFileSize {
+ return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
+ }
+
+ if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
+ return nil, err
+ }
+
+ _, _, size, err = git.ReadBatchLine(batchReader)
+ if err != nil {
+ return nil, err
+ }
+
+ fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
+ if err != nil {
+ return nil, err
+ } else if !typesniffer.DetectContentType(fileContents).IsText() {
+ // FIXME: UTF-16 files will probably fail here
+ return nil, nil
+ }
+
+ if _, err = batchReader.Discard(1); err != nil {
+ return nil, err
+ }
+ id := internal.FilenameIndexerID(repo.ID, update.Filename)
+
+ return []elastic.BulkableRequest{
+ elastic.NewBulkIndexRequest().
+ Index(b.inner.VersionedIndexName()).
+ Id(id).
+ Doc(map[string]any{
+ "repo_id": repo.ID,
+ "content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
+ "commit_id": sha,
+ "language": analyze.GetCodeLanguage(update.Filename, fileContents),
+ "updated_at": timeutil.TimeStampNow(),
+ }),
+ }, nil
+}
+
+func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
+ id := internal.FilenameIndexerID(repo.ID, filename)
+ return elastic.NewBulkDeleteRequest().
+ Index(b.inner.VersionedIndexName()).
+ Id(id)
+}
+
+// Index will save the index data
+func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
+ reqs := make([]elastic.BulkableRequest, 0)
+ if len(changes.Updates) > 0 {
+ r, err := gitrepo.OpenRepository(ctx, repo)
+ if err != nil {
+ return err
+ }
+ defer r.Close()
+ batch, err := r.NewBatch(ctx)
+ if err != nil {
+ return err
+ }
+ defer batch.Close()
+
+ for _, update := range changes.Updates {
+ updateReqs, err := b.addUpdate(ctx, batch.Writer, batch.Reader, sha, update, repo)
+ if err != nil {
+ return err
+ }
+ if len(updateReqs) > 0 {
+ reqs = append(reqs, updateReqs...)
+ }
+ }
+ batch.Close()
+ }
+
+ for _, filename := range changes.RemovedFilenames {
+ reqs = append(reqs, b.addDelete(filename, repo))
+ }
+
+ if len(reqs) > 0 {
+ esBatchSize := 50
+
+ for i := 0; i < len(reqs); i += esBatchSize {
+ _, err := b.inner.Client.Bulk().
+ Index(b.inner.VersionedIndexName()).
+ Add(reqs[i:min(i+esBatchSize, len(reqs))]...).
+ Do(ctx)
+ if err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+// Delete entries by repoId
+func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
+ if err := b.doDelete(ctx, repoID); err != nil {
+ // Maybe there is a conflict during the delete operation, so we should retry after a refresh
+ log.Warn("Deletion of entries of repo %v within index %v was erroneus. Trying to refresh index before trying again", repoID, b.inner.VersionedIndexName(), err)
+ if err := b.refreshIndex(ctx); err != nil {
+ return err
+ }
+ if err := b.doDelete(ctx, repoID); err != nil {
+ log.Error("Could not delete entries of repo %v within index %v", repoID, b.inner.VersionedIndexName())
+ return err
+ }
+ }
+ return nil
+}
+
+func (b *Indexer) refreshIndex(ctx context.Context) error {
+ if _, err := b.inner.Client.Refresh(b.inner.VersionedIndexName()).Do(ctx); err != nil {
+ log.Error("Error while trying to refresh index %v", b.inner.VersionedIndexName(), err)
+ return err
+ }
+
+ return nil
+}
+
+// Delete entries by repoId
+func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
+ _, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()).
+ Query(elastic.NewTermsQuery("repo_id", repoID)).
+ Do(ctx)
+ return err
+}
+
+// indexPos find words positions for start and the following end on content. It will
+// return the beginning position of the first start and the ending position of the
+// first end following the start string.
+// If not found any of the positions, it will return -1, -1.
+func indexPos(content, start, end string) (int, int) {
+ startIdx := strings.Index(content, start)
+ if startIdx < 0 {
+ return -1, -1
+ }
+ endIdx := strings.Index(content[startIdx+len(start):], end)
+ if endIdx < 0 {
+ return -1, -1
+ }
+ return startIdx, startIdx + len(start) + endIdx + len(end)
+}
+
+func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
+ hits := make([]*internal.SearchResult, 0, pageSize)
+ for _, hit := range searchResult.Hits.Hits {
+ // FIXME: There is no way to get the position the keyword on the content currently on the same request.
+ // So we get it from content, this may made the query slower. See
+ // https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
+ var startIndex, endIndex int
+ c, ok := hit.Highlight["content"]
+ if ok && len(c) > 0 {
+ // FIXME: Since the highlighting content will include <em> and </em> for the keywords,
+ // now we should find the positions. But how to avoid html content which contains the
+ // <em> and </em> tags? If elastic search has handled that?
+ startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
+ if startIndex == -1 {
+ panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
+ }
+ } else {
+ panic(fmt.Sprintf("2===%#v", hit.Highlight))
+ }
+
+ repoID, fileName := internal.ParseIndexerID(hit.Id)
+ res := make(map[string]any)
+ if err := json.Unmarshal(hit.Source, &res); err != nil {
+ return 0, nil, nil, err
+ }
+
+ language := res["language"].(string)
+
+ hits = append(hits, &internal.SearchResult{
+ RepoID: repoID,
+ Filename: fileName,
+ CommitID: res["commit_id"].(string),
+ Content: res["content"].(string),
+ UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
+ Language: language,
+ StartIndex: startIndex,
+ EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data
+ Color: enry.GetColor(language),
+ })
+ }
+
+ return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
+}
+
+func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages {
+ var searchResultLanguages []*internal.SearchResultLanguages
+ agg, found := searchResult.Aggregations.Terms("language")
+ if found {
+ searchResultLanguages = make([]*internal.SearchResultLanguages, 0, 10)
+
+ for _, bucket := range agg.Buckets {
+ searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
+ Language: bucket.Key.(string),
+ Color: enry.GetColor(bucket.Key.(string)),
+ Count: int(bucket.DocCount),
+ })
+ }
+ }
+ return searchResultLanguages
+}
+
+// Search searches for codes and language stats by given conditions.
+func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
+ searchType := esMultiMatchTypePhrasePrefix
+ if opts.IsKeywordFuzzy {
+ searchType = esMultiMatchTypeBestFields
+ }
+
+ kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType)
+ query := elastic.NewBoolQuery()
+ query = query.Must(kwQuery)
+ if len(opts.RepoIDs) > 0 {
+ repoStrs := make([]any, 0, len(opts.RepoIDs))
+ for _, repoID := range opts.RepoIDs {
+ repoStrs = append(repoStrs, repoID)
+ }
+ repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
+ query = query.Must(repoQuery)
+ }
+
+ var (
+ start, pageSize = opts.GetSkipTake()
+ kw = "<em>" + opts.Keyword + "</em>"
+ aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
+ )
+
+ if len(opts.Language) == 0 {
+ searchResult, err := b.inner.Client.Search().
+ Index(b.inner.VersionedIndexName()).
+ Aggregation("language", aggregation).
+ Query(query).
+ Highlight(
+ elastic.NewHighlight().
+ Field("content").
+ NumOfFragments(0). // return all highting content on fragments
+ HighlighterType("fvh"),
+ ).
+ Sort("repo_id", true).
+ From(start).Size(pageSize).
+ Do(ctx)
+ if err != nil {
+ return 0, nil, nil, err
+ }
+
+ return convertResult(searchResult, kw, pageSize)
+ }
+
+ langQuery := elastic.NewMatchQuery("language", opts.Language)
+ countResult, err := b.inner.Client.Search().
+ Index(b.inner.VersionedIndexName()).
+ Aggregation("language", aggregation).
+ Query(query).
+ Size(0). // We only need stats information
+ Do(ctx)
+ if err != nil {
+ return 0, nil, nil, err
+ }
+
+ query = query.Must(langQuery)
+ searchResult, err := b.inner.Client.Search().
+ Index(b.inner.VersionedIndexName()).
+ Query(query).
+ Highlight(
+ elastic.NewHighlight().
+ Field("content").
+ NumOfFragments(0). // return all highting content on fragments
+ HighlighterType("fvh"),
+ ).
+ Sort("repo_id", true).
+ From(start).Size(pageSize).
+ Do(ctx)
+ if err != nil {
+ return 0, nil, nil, err
+ }
+
+ total, hits, _, err := convertResult(searchResult, kw, pageSize)
+
+ return total, hits, extractAggs(countResult), err
+}
diff --git a/modules/indexer/code/elasticsearch/elasticsearch_test.go b/modules/indexer/code/elasticsearch/elasticsearch_test.go
new file mode 100644
index 0000000..c6ba93e
--- /dev/null
+++ b/modules/indexer/code/elasticsearch/elasticsearch_test.go
@@ -0,0 +1,16 @@
+// Copyright 2020 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package elasticsearch
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestIndexPos(t *testing.T) {
+ startIdx, endIdx := indexPos("test index start and end", "start", "end")
+ assert.EqualValues(t, 11, startIdx)
+ assert.EqualValues(t, 24, endIdx)
+}
diff --git a/modules/indexer/code/git.go b/modules/indexer/code/git.go
new file mode 100644
index 0000000..c7ffcfd
--- /dev/null
+++ b/modules/indexer/code/git.go
@@ -0,0 +1,199 @@
+// Copyright 2019 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package code
+
+import (
+ "context"
+ "strconv"
+ "strings"
+
+ repo_model "code.gitea.io/gitea/models/repo"
+ "code.gitea.io/gitea/modules/git"
+ "code.gitea.io/gitea/modules/indexer/code/internal"
+ "code.gitea.io/gitea/modules/log"
+ "code.gitea.io/gitea/modules/setting"
+)
+
+func getDefaultBranchSha(ctx context.Context, repo *repo_model.Repository) (string, error) {
+ stdout, _, err := git.NewCommand(ctx, "show-ref", "-s").AddDynamicArguments(git.BranchPrefix + repo.DefaultBranch).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
+ if err != nil {
+ return "", err
+ }
+ return strings.TrimSpace(stdout), nil
+}
+
+// getRepoChanges returns changes to repo since last indexer update
+func getRepoChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
+ status, err := repo_model.GetIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode)
+ if err != nil {
+ return nil, err
+ }
+
+ needGenesis := len(status.CommitSha) == 0
+ if !needGenesis {
+ hasAncestorCmd := git.NewCommand(ctx, "merge-base").AddDynamicArguments(status.CommitSha, revision)
+ stdout, _, _ := hasAncestorCmd.RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
+ needGenesis = len(stdout) == 0
+ }
+
+ if needGenesis {
+ return genesisChanges(ctx, repo, revision)
+ }
+ return nonGenesisChanges(ctx, repo, revision)
+}
+
+func isIndexable(entry *git.TreeEntry) bool {
+ if !entry.IsRegular() && !entry.IsExecutable() {
+ return false
+ }
+ name := strings.ToLower(entry.Name())
+ for _, g := range setting.Indexer.ExcludePatterns {
+ if g.Match(name) {
+ return false
+ }
+ }
+ for _, g := range setting.Indexer.IncludePatterns {
+ if g.Match(name) {
+ return true
+ }
+ }
+ return len(setting.Indexer.IncludePatterns) == 0
+}
+
+// parseGitLsTreeOutput parses the output of a `git ls-tree -r --full-name` command
+func parseGitLsTreeOutput(stdout []byte) ([]internal.FileUpdate, error) {
+ entries, err := git.ParseTreeEntries(stdout)
+ if err != nil {
+ return nil, err
+ }
+ idxCount := 0
+ updates := make([]internal.FileUpdate, len(entries))
+ for _, entry := range entries {
+ if isIndexable(entry) {
+ updates[idxCount] = internal.FileUpdate{
+ Filename: entry.Name(),
+ BlobSha: entry.ID.String(),
+ Size: entry.Size(),
+ Sized: true,
+ }
+ idxCount++
+ }
+ }
+ return updates[:idxCount], nil
+}
+
+// genesisChanges get changes to add repo to the indexer for the first time
+func genesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
+ var changes internal.RepoChanges
+ stdout, _, runErr := git.NewCommand(ctx, "ls-tree", "--full-tree", "-l", "-r").AddDynamicArguments(revision).RunStdBytes(&git.RunOpts{Dir: repo.RepoPath()})
+ if runErr != nil {
+ return nil, runErr
+ }
+
+ var err error
+ changes.Updates, err = parseGitLsTreeOutput(stdout)
+ return &changes, err
+}
+
+// nonGenesisChanges get changes since the previous indexer update
+func nonGenesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
+ diffCmd := git.NewCommand(ctx, "diff", "--name-status").AddDynamicArguments(repo.CodeIndexerStatus.CommitSha, revision)
+ stdout, _, runErr := diffCmd.RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
+ if runErr != nil {
+ // previous commit sha may have been removed by a force push, so
+ // try rebuilding from scratch
+ log.Warn("git diff: %v", runErr)
+ if err := (*globalIndexer.Load()).Delete(ctx, repo.ID); err != nil {
+ return nil, err
+ }
+ return genesisChanges(ctx, repo, revision)
+ }
+
+ var changes internal.RepoChanges
+ var err error
+ updatedFilenames := make([]string, 0, 10)
+
+ updateChanges := func() error {
+ cmd := git.NewCommand(ctx, "ls-tree", "--full-tree", "-l").AddDynamicArguments(revision).
+ AddDashesAndList(updatedFilenames...)
+ lsTreeStdout, _, err := cmd.RunStdBytes(&git.RunOpts{Dir: repo.RepoPath()})
+ if err != nil {
+ return err
+ }
+
+ updates, err1 := parseGitLsTreeOutput(lsTreeStdout)
+ if err1 != nil {
+ return err1
+ }
+ changes.Updates = append(changes.Updates, updates...)
+ return nil
+ }
+ lines := strings.Split(stdout, "\n")
+ for _, line := range lines {
+ line = strings.TrimSpace(line)
+ if len(line) == 0 {
+ continue
+ }
+ fields := strings.Split(line, "\t")
+ if len(fields) < 2 {
+ log.Warn("Unparsable output for diff --name-status: `%s`)", line)
+ continue
+ }
+ filename := fields[1]
+ if len(filename) == 0 {
+ continue
+ } else if filename[0] == '"' {
+ filename, err = strconv.Unquote(filename)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ switch status := fields[0][0]; status {
+ case 'M', 'A':
+ updatedFilenames = append(updatedFilenames, filename)
+ case 'D':
+ changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
+ case 'R', 'C':
+ if len(fields) < 3 {
+ log.Warn("Unparsable output for diff --name-status: `%s`)", line)
+ continue
+ }
+ dest := fields[2]
+ if len(dest) == 0 {
+ log.Warn("Unparsable output for diff --name-status: `%s`)", line)
+ continue
+ }
+ if dest[0] == '"' {
+ dest, err = strconv.Unquote(dest)
+ if err != nil {
+ return nil, err
+ }
+ }
+ if status == 'R' {
+ changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
+ }
+ updatedFilenames = append(updatedFilenames, dest)
+ default:
+ log.Warn("Unrecognized status: %c (line=%s)", status, line)
+ }
+
+ // According to https://learn.microsoft.com/en-us/troubleshoot/windows-client/shell-experience/command-line-string-limitation#more-information
+ // the command line length should less than 8191 characters, assume filepath is 256, then 8191/256 = 31, so we use 30
+ if len(updatedFilenames) >= 30 {
+ if err := updateChanges(); err != nil {
+ return nil, err
+ }
+ updatedFilenames = updatedFilenames[0:0]
+ }
+ }
+
+ if len(updatedFilenames) > 0 {
+ if err := updateChanges(); err != nil {
+ return nil, err
+ }
+ }
+
+ return &changes, err
+}
diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go
new file mode 100644
index 0000000..0a8ce27
--- /dev/null
+++ b/modules/indexer/code/indexer.go
@@ -0,0 +1,310 @@
+// Copyright 2016 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package code
+
+import (
+ "context"
+ "os"
+ "runtime/pprof"
+ "slices"
+ "sync/atomic"
+ "time"
+
+ "code.gitea.io/gitea/models/db"
+ repo_model "code.gitea.io/gitea/models/repo"
+ "code.gitea.io/gitea/modules/graceful"
+ "code.gitea.io/gitea/modules/indexer/code/bleve"
+ "code.gitea.io/gitea/modules/indexer/code/elasticsearch"
+ "code.gitea.io/gitea/modules/indexer/code/internal"
+ "code.gitea.io/gitea/modules/log"
+ "code.gitea.io/gitea/modules/process"
+ "code.gitea.io/gitea/modules/queue"
+ "code.gitea.io/gitea/modules/setting"
+)
+
+var (
+ indexerQueue *queue.WorkerPoolQueue[*internal.IndexerData]
+ // globalIndexer is the global indexer, it cannot be nil.
+ // When the real indexer is not ready, it will be a dummy indexer which will return error to explain it's not ready.
+ // So it's always safe use it as *globalIndexer.Load() and call its methods.
+ globalIndexer atomic.Pointer[internal.Indexer]
+ dummyIndexer *internal.Indexer
+)
+
+func init() {
+ i := internal.NewDummyIndexer()
+ dummyIndexer = &i
+ globalIndexer.Store(dummyIndexer)
+}
+
+func index(ctx context.Context, indexer internal.Indexer, repoID int64) error {
+ repo, err := repo_model.GetRepositoryByID(ctx, repoID)
+ if repo_model.IsErrRepoNotExist(err) {
+ return indexer.Delete(ctx, repoID)
+ }
+ if err != nil {
+ return err
+ }
+
+ repoTypes := setting.Indexer.RepoIndexerRepoTypes
+
+ if len(repoTypes) == 0 {
+ repoTypes = []string{"sources"}
+ }
+
+ // skip forks from being indexed if unit is not present
+ if !slices.Contains(repoTypes, "forks") && repo.IsFork {
+ return nil
+ }
+
+ // skip mirrors from being indexed if unit is not present
+ if !slices.Contains(repoTypes, "mirrors") && repo.IsMirror {
+ return nil
+ }
+
+ // skip templates from being indexed if unit is not present
+ if !slices.Contains(repoTypes, "templates") && repo.IsTemplate {
+ return nil
+ }
+
+ // skip regular repos from being indexed if unit is not present
+ if !slices.Contains(repoTypes, "sources") && !repo.IsFork && !repo.IsMirror && !repo.IsTemplate {
+ return nil
+ }
+
+ sha, err := getDefaultBranchSha(ctx, repo)
+ if err != nil {
+ return err
+ }
+ changes, err := getRepoChanges(ctx, repo, sha)
+ if err != nil {
+ return err
+ } else if changes == nil {
+ return nil
+ }
+
+ if err := indexer.Index(ctx, repo, sha, changes); err != nil {
+ return err
+ }
+
+ return repo_model.UpdateIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode, sha)
+}
+
+// Init initialize the repo indexer
+func Init() {
+ if !setting.Indexer.RepoIndexerEnabled {
+ (*globalIndexer.Load()).Close()
+ return
+ }
+
+ ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), "Service: CodeIndexer", process.SystemProcessType, false)
+
+ graceful.GetManager().RunAtTerminate(func() {
+ select {
+ case <-ctx.Done():
+ return
+ default:
+ }
+ cancel()
+ log.Debug("Closing repository indexer")
+ (*globalIndexer.Load()).Close()
+ log.Info("PID: %d Repository Indexer closed", os.Getpid())
+ finished()
+ })
+
+ waitChannel := make(chan time.Duration, 1)
+
+ // Create the Queue
+ switch setting.Indexer.RepoType {
+ case "bleve", "elasticsearch":
+ handler := func(items ...*internal.IndexerData) (unhandled []*internal.IndexerData) {
+ indexer := *globalIndexer.Load()
+ // make it a process to allow for cancellation (especially during integration tests where no global shutdown happens)
+ batchCtx, _, finished := process.GetManager().AddContext(ctx, "CodeIndexer batch")
+ defer finished()
+ for _, indexerData := range items {
+ log.Trace("IndexerData Process Repo: %d", indexerData.RepoID)
+ if err := index(batchCtx, indexer, indexerData.RepoID); err != nil {
+ unhandled = append(unhandled, indexerData)
+ if !setting.IsInTesting {
+ log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err)
+ }
+ }
+ }
+ return unhandled
+ }
+
+ indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler)
+ if indexerQueue == nil {
+ log.Fatal("Unable to create codes indexer queue")
+ }
+ default:
+ log.Fatal("Unknown codes indexer type; %s", setting.Indexer.RepoType)
+ }
+
+ go func() {
+ pprof.SetGoroutineLabels(ctx)
+ start := time.Now()
+ var (
+ rIndexer internal.Indexer
+ existed bool
+ err error
+ )
+ switch setting.Indexer.RepoType {
+ case "bleve":
+ log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), setting.Indexer.RepoPath)
+ defer func() {
+ if err := recover(); err != nil {
+ log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2))
+ log.Error("The indexer files are likely corrupted and may need to be deleted")
+ log.Error("You can completely remove the \"%s\" directory to make Forgejo recreate the indexes", setting.Indexer.RepoPath)
+ }
+ }()
+
+ rIndexer = bleve.NewIndexer(setting.Indexer.RepoPath)
+ existed, err = rIndexer.Init(ctx)
+ if err != nil {
+ cancel()
+ (*globalIndexer.Load()).Close()
+ close(waitChannel)
+ log.Fatal("PID: %d Unable to initialize the bleve Repository Indexer at path: %s Error: %v", os.Getpid(), setting.Indexer.RepoPath, err)
+ }
+ case "elasticsearch":
+ log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), setting.Indexer.RepoConnStr)
+ defer func() {
+ if err := recover(); err != nil {
+ log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2))
+ log.Error("The indexer files are likely corrupted and may need to be deleted")
+ log.Error("You can completely remove the \"%s\" index to make Forgejo recreate the indexes", setting.Indexer.RepoConnStr)
+ }
+ }()
+
+ rIndexer = elasticsearch.NewIndexer(setting.Indexer.RepoConnStr, setting.Indexer.RepoIndexerName)
+ existed, err = rIndexer.Init(ctx)
+ if err != nil {
+ cancel()
+ (*globalIndexer.Load()).Close()
+ close(waitChannel)
+ log.Fatal("PID: %d Unable to initialize the elasticsearch Repository Indexer connstr: %s Error: %v", os.Getpid(), setting.Indexer.RepoConnStr, err)
+ }
+
+ default:
+ log.Fatal("PID: %d Unknown Indexer type: %s", os.Getpid(), setting.Indexer.RepoType)
+ }
+
+ globalIndexer.Store(&rIndexer)
+
+ // Start processing the queue
+ go graceful.GetManager().RunWithCancel(indexerQueue)
+
+ if !existed { // populate the index because it's created for the first time
+ go graceful.GetManager().RunWithShutdownContext(populateRepoIndexer)
+ }
+ select {
+ case waitChannel <- time.Since(start):
+ case <-graceful.GetManager().IsShutdown():
+ }
+
+ close(waitChannel)
+ }()
+
+ if setting.Indexer.StartupTimeout > 0 {
+ go func() {
+ pprof.SetGoroutineLabels(ctx)
+ timeout := setting.Indexer.StartupTimeout
+ if graceful.GetManager().IsChild() && setting.GracefulHammerTime > 0 {
+ timeout += setting.GracefulHammerTime
+ }
+ select {
+ case <-graceful.GetManager().IsShutdown():
+ log.Warn("Shutdown before Repository Indexer completed initialization")
+ cancel()
+ (*globalIndexer.Load()).Close()
+ case duration, ok := <-waitChannel:
+ if !ok {
+ log.Warn("Repository Indexer Initialization failed")
+ cancel()
+ (*globalIndexer.Load()).Close()
+ return
+ }
+ log.Info("Repository Indexer Initialization took %v", duration)
+ case <-time.After(timeout):
+ cancel()
+ (*globalIndexer.Load()).Close()
+ log.Fatal("Repository Indexer Initialization Timed-Out after: %v", timeout)
+ }
+ }()
+ }
+}
+
+// UpdateRepoIndexer update a repository's entries in the indexer
+func UpdateRepoIndexer(repo *repo_model.Repository) {
+ indexData := &internal.IndexerData{RepoID: repo.ID}
+ if err := indexerQueue.Push(indexData); err != nil {
+ log.Error("Update repo index data %v failed: %v", indexData, err)
+ }
+}
+
+// IsAvailable checks if issue indexer is available
+func IsAvailable(ctx context.Context) bool {
+ return (*globalIndexer.Load()).Ping(ctx) == nil
+}
+
+// populateRepoIndexer populate the repo indexer with pre-existing data. This
+// should only be run when the indexer is created for the first time.
+func populateRepoIndexer(ctx context.Context) {
+ log.Info("Populating the repo indexer with existing repositories")
+
+ exist, err := db.IsTableNotEmpty("repository")
+ if err != nil {
+ log.Fatal("System error: %v", err)
+ } else if !exist {
+ return
+ }
+
+ // if there is any existing repo indexer metadata in the DB, delete it
+ // since we are starting afresh. Also, xorm requires deletes to have a
+ // condition, and we want to delete everything, thus 1=1.
+ if err := db.DeleteAllRecords("repo_indexer_status"); err != nil {
+ log.Fatal("System error: %v", err)
+ }
+
+ var maxRepoID int64
+ if maxRepoID, err = db.GetMaxID("repository"); err != nil {
+ log.Fatal("System error: %v", err)
+ }
+
+ // start with the maximum existing repo ID and work backwards, so that we
+ // don't include repos that are created after gitea starts; such repos will
+ // already be added to the indexer, and we don't need to add them again.
+ for maxRepoID > 0 {
+ select {
+ case <-ctx.Done():
+ log.Info("Repository Indexer population shutdown before completion")
+ return
+ default:
+ }
+ ids, err := repo_model.GetUnindexedRepos(ctx, repo_model.RepoIndexerTypeCode, maxRepoID, 0, 50)
+ if err != nil {
+ log.Error("populateRepoIndexer: %v", err)
+ return
+ } else if len(ids) == 0 {
+ break
+ }
+ for _, id := range ids {
+ select {
+ case <-ctx.Done():
+ log.Info("Repository Indexer population shutdown before completion")
+ return
+ default:
+ }
+ if err := indexerQueue.Push(&internal.IndexerData{RepoID: id}); err != nil {
+ log.Error("indexerQueue.Push: %v", err)
+ return
+ }
+ maxRepoID = id - 1
+ }
+ }
+ log.Info("Done (re)populating the repo indexer with existing repositories")
+}
diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go
new file mode 100644
index 0000000..967aad1
--- /dev/null
+++ b/modules/indexer/code/indexer_test.go
@@ -0,0 +1,145 @@
+// Copyright 2020 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package code
+
+import (
+ "context"
+ "os"
+ "testing"
+
+ "code.gitea.io/gitea/models/db"
+ "code.gitea.io/gitea/models/unittest"
+ "code.gitea.io/gitea/modules/git"
+ "code.gitea.io/gitea/modules/indexer/code/bleve"
+ "code.gitea.io/gitea/modules/indexer/code/elasticsearch"
+ "code.gitea.io/gitea/modules/indexer/code/internal"
+
+ _ "code.gitea.io/gitea/models"
+ _ "code.gitea.io/gitea/models/actions"
+ _ "code.gitea.io/gitea/models/activities"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestMain(m *testing.M) {
+ unittest.MainTest(m)
+}
+
+func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
+ t.Run(name, func(t *testing.T) {
+ var repoID int64 = 1
+ err := index(git.DefaultContext, indexer, repoID)
+ require.NoError(t, err)
+ keywords := []struct {
+ RepoIDs []int64
+ Keyword string
+ IDs []int64
+ Langs int
+ }{
+ {
+ RepoIDs: nil,
+ Keyword: "Description",
+ IDs: []int64{repoID},
+ Langs: 1,
+ },
+ {
+ RepoIDs: []int64{2},
+ Keyword: "Description",
+ IDs: []int64{},
+ Langs: 0,
+ },
+ {
+ RepoIDs: nil,
+ Keyword: "Description for",
+ IDs: []int64{repoID},
+ Langs: 1,
+ },
+ {
+ RepoIDs: nil,
+ Keyword: "repo1",
+ IDs: []int64{repoID},
+ Langs: 1,
+ },
+ {
+ RepoIDs: []int64{2},
+ Keyword: "repo1",
+ IDs: []int64{},
+ Langs: 0,
+ },
+ {
+ RepoIDs: nil,
+ Keyword: "non-exist",
+ IDs: []int64{},
+ Langs: 0,
+ },
+ }
+
+ for _, kw := range keywords {
+ t.Run(kw.Keyword, func(t *testing.T) {
+ total, res, langs, err := indexer.Search(context.TODO(), &internal.SearchOptions{
+ RepoIDs: kw.RepoIDs,
+ Keyword: kw.Keyword,
+ Paginator: &db.ListOptions{
+ Page: 1,
+ PageSize: 10,
+ },
+ IsKeywordFuzzy: true,
+ })
+ require.NoError(t, err)
+ assert.Len(t, kw.IDs, int(total))
+ assert.Len(t, langs, kw.Langs)
+
+ ids := make([]int64, 0, len(res))
+ for _, hit := range res {
+ ids = append(ids, hit.RepoID)
+ assert.EqualValues(t, "# repo1\n\nDescription for repo1", hit.Content)
+ }
+ assert.EqualValues(t, kw.IDs, ids)
+ })
+ }
+
+ require.NoError(t, indexer.Delete(context.Background(), repoID))
+ })
+}
+
+func TestBleveIndexAndSearch(t *testing.T) {
+ unittest.PrepareTestEnv(t)
+
+ dir := t.TempDir()
+
+ idx := bleve.NewIndexer(dir)
+ _, err := idx.Init(context.Background())
+ if err != nil {
+ if idx != nil {
+ idx.Close()
+ }
+ assert.FailNow(t, "Unable to create bleve indexer Error: %v", err)
+ }
+ defer idx.Close()
+
+ testIndexer("bleve", t, idx)
+}
+
+func TestESIndexAndSearch(t *testing.T) {
+ unittest.PrepareTestEnv(t)
+
+ u := os.Getenv("TEST_INDEXER_CODE_ES_URL")
+ if u == "" {
+ t.SkipNow()
+ return
+ }
+
+ indexer := elasticsearch.NewIndexer(u, "gitea_codes")
+ if _, err := indexer.Init(context.Background()); err != nil {
+ if indexer != nil {
+ indexer.Close()
+ }
+ assert.FailNow(t, "Unable to init ES indexer Error: %v", err)
+ }
+
+ defer indexer.Close()
+
+ testIndexer("elastic_search", t, indexer)
+}
diff --git a/modules/indexer/code/internal/indexer.go b/modules/indexer/code/internal/indexer.go
new file mode 100644
index 0000000..c259fcd
--- /dev/null
+++ b/modules/indexer/code/internal/indexer.go
@@ -0,0 +1,54 @@
+// Copyright 2023 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package internal
+
+import (
+ "context"
+ "fmt"
+
+ "code.gitea.io/gitea/models/db"
+ repo_model "code.gitea.io/gitea/models/repo"
+ "code.gitea.io/gitea/modules/indexer/internal"
+)
+
+// Indexer defines an interface to index and search code contents
+type Indexer interface {
+ internal.Indexer
+ Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
+ Delete(ctx context.Context, repoID int64) error
+ Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
+}
+
+type SearchOptions struct {
+ RepoIDs []int64
+ Keyword string
+ Language string
+
+ IsKeywordFuzzy bool
+
+ db.Paginator
+}
+
+// NewDummyIndexer returns a dummy indexer
+func NewDummyIndexer() Indexer {
+ return &dummyIndexer{
+ Indexer: internal.NewDummyIndexer(),
+ }
+}
+
+type dummyIndexer struct {
+ internal.Indexer
+}
+
+func (d *dummyIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error {
+ return fmt.Errorf("indexer is not ready")
+}
+
+func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
+ return fmt.Errorf("indexer is not ready")
+}
+
+func (d *dummyIndexer) Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error) {
+ return 0, nil, nil, fmt.Errorf("indexer is not ready")
+}
diff --git a/modules/indexer/code/internal/model.go b/modules/indexer/code/internal/model.go
new file mode 100644
index 0000000..f75263c
--- /dev/null
+++ b/modules/indexer/code/internal/model.go
@@ -0,0 +1,44 @@
+// Copyright 2023 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package internal
+
+import "code.gitea.io/gitea/modules/timeutil"
+
+type FileUpdate struct {
+ Filename string
+ BlobSha string
+ Size int64
+ Sized bool
+}
+
+// RepoChanges changes (file additions/updates/removals) to a repo
+type RepoChanges struct {
+ Updates []FileUpdate
+ RemovedFilenames []string
+}
+
+// IndexerData represents data stored in the code indexer
+type IndexerData struct {
+ RepoID int64
+}
+
+// SearchResult result of performing a search in a repo
+type SearchResult struct {
+ RepoID int64
+ StartIndex int
+ EndIndex int
+ Filename string
+ Content string
+ CommitID string
+ UpdatedUnix timeutil.TimeStamp
+ Language string
+ Color string
+}
+
+// SearchResultLanguages result of top languages count in search results
+type SearchResultLanguages struct {
+ Language string
+ Color string
+ Count int
+}
diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go
new file mode 100644
index 0000000..689c4f4
--- /dev/null
+++ b/modules/indexer/code/internal/util.go
@@ -0,0 +1,32 @@
+// Copyright 2023 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package internal
+
+import (
+ "strings"
+
+ "code.gitea.io/gitea/modules/indexer/internal"
+ "code.gitea.io/gitea/modules/log"
+)
+
+func FilenameIndexerID(repoID int64, filename string) string {
+ return internal.Base36(repoID) + "_" + filename
+}
+
+func ParseIndexerID(indexerID string) (int64, string) {
+ index := strings.IndexByte(indexerID, '_')
+ if index == -1 {
+ log.Error("Unexpected ID in repo indexer: %s", indexerID)
+ }
+ repoID, _ := internal.ParseBase36(indexerID[:index])
+ return repoID, indexerID[index+1:]
+}
+
+func FilenameOfIndexerID(indexerID string) string {
+ index := strings.IndexByte(indexerID, '_')
+ if index == -1 {
+ log.Error("Unexpected ID in repo indexer: %s", indexerID)
+ }
+ return indexerID[index+1:]
+}
diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go
new file mode 100644
index 0000000..f45907a
--- /dev/null
+++ b/modules/indexer/code/search.go
@@ -0,0 +1,228 @@
+// Copyright 2017 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package code
+
+import (
+ "bytes"
+ "context"
+ "html/template"
+ "strings"
+
+ "code.gitea.io/gitea/modules/highlight"
+ "code.gitea.io/gitea/modules/indexer/code/internal"
+ "code.gitea.io/gitea/modules/timeutil"
+ "code.gitea.io/gitea/services/gitdiff"
+)
+
+// Result a search result to display
+type Result struct {
+ RepoID int64
+ Filename string
+ CommitID string
+ UpdatedUnix timeutil.TimeStamp
+ Language string
+ Color string
+ Lines []ResultLine
+}
+
+type ResultLine struct {
+ Num int
+ FormattedContent template.HTML
+}
+
+type SearchResultLanguages = internal.SearchResultLanguages
+
+type SearchOptions = internal.SearchOptions
+
+func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) {
+ startIndex := selectionStartIndex
+ numLinesBefore := 0
+ for ; startIndex > 0; startIndex-- {
+ if content[startIndex-1] == '\n' {
+ if numLinesBefore == 1 {
+ break
+ }
+ numLinesBefore++
+ }
+ }
+
+ endIndex := selectionEndIndex
+ numLinesAfter := 0
+ for ; endIndex < len(content); endIndex++ {
+ if content[endIndex] == '\n' {
+ if numLinesAfter == 1 {
+ break
+ }
+ numLinesAfter++
+ }
+ }
+
+ return startIndex, endIndex
+}
+
+func writeStrings(buf *bytes.Buffer, strs ...string) error {
+ for _, s := range strs {
+ _, err := buf.WriteString(s)
+ if err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+const (
+ highlightTagStart = "<span class=\"search-highlight\">"
+ highlightTagEnd = "</span>"
+)
+
+func HighlightSearchResultCode(filename string, lineNums []int, highlightRanges [][3]int, code string) []ResultLine {
+ hcd := gitdiff.NewHighlightCodeDiff()
+ hcd.CollectUsedRunes(code)
+ startTag, endTag := hcd.NextPlaceholder(), hcd.NextPlaceholder()
+ hcd.PlaceholderTokenMap[startTag] = highlightTagStart
+ hcd.PlaceholderTokenMap[endTag] = highlightTagEnd
+
+ // we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting
+ hl, _ := highlight.Code(filename, "", code)
+ conv := hcd.ConvertToPlaceholders(string(hl))
+ convLines := strings.Split(conv, "\n")
+
+ // each highlightRange is of the form [line number, start pos, end pos]
+ for _, highlightRange := range highlightRanges {
+ ln, start, end := highlightRange[0], highlightRange[1], highlightRange[2]
+ line := convLines[ln]
+ if line == "" || len(line) <= start || len(line) < end {
+ continue
+ }
+
+ sb := strings.Builder{}
+ count := -1
+ isOpen := false
+ for _, r := range line {
+ if token, ok := hcd.PlaceholderTokenMap[r];
+ // token was not found
+ !ok ||
+ // token was marked as used
+ token == "" ||
+ // the token is not an valid html tag emitted by chroma
+ !(len(token) > 6 && (token[0:5] == "<span" || token[0:6] == "</span")) {
+ count++
+ } else if !isOpen {
+ // open the tag only after all other placeholders
+ sb.WriteRune(r)
+ continue
+ } else if isOpen && count < end {
+ // if the tag is open, but a placeholder exists in between
+ // close the tag
+ sb.WriteRune(endTag)
+ // write the placeholder
+ sb.WriteRune(r)
+ // reopen the tag
+ sb.WriteRune(startTag)
+ continue
+ }
+
+ switch count {
+ case end:
+ // if tag is not open, no need to close
+ if !isOpen {
+ break
+ }
+ sb.WriteRune(endTag)
+ isOpen = false
+ case start:
+ // if tag is open, do not open again
+ if isOpen {
+ break
+ }
+ isOpen = true
+ sb.WriteRune(startTag)
+ }
+
+ sb.WriteRune(r)
+ }
+ if isOpen {
+ sb.WriteRune(endTag)
+ }
+ convLines[ln] = sb.String()
+ }
+ conv = strings.Join(convLines, "\n")
+
+ highlightedLines := strings.Split(hcd.Recover(conv), "\n")
+ // The lineNums outputted by highlight.Code might not match the original lineNums, because "highlight" removes the last `\n`
+ lines := make([]ResultLine, min(len(highlightedLines), len(lineNums)))
+ for i := 0; i < len(lines); i++ {
+ lines[i].Num = lineNums[i]
+ lines[i].FormattedContent = template.HTML(highlightedLines[i])
+ }
+ return lines
+}
+
+func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Result, error) {
+ startLineNum := 1 + strings.Count(result.Content[:startIndex], "\n")
+
+ var formattedLinesBuffer bytes.Buffer
+
+ contentLines := strings.SplitAfter(result.Content[startIndex:endIndex], "\n")
+ lineNums := make([]int, 0, len(contentLines))
+ index := startIndex
+ var highlightRanges [][3]int
+ for i, line := range contentLines {
+ var err error
+ if index < result.EndIndex &&
+ result.StartIndex < index+len(line) &&
+ result.StartIndex < result.EndIndex {
+ openActiveIndex := max(result.StartIndex-index, 0)
+ closeActiveIndex := min(result.EndIndex-index, len(line))
+ highlightRanges = append(highlightRanges, [3]int{i, openActiveIndex, closeActiveIndex})
+ err = writeStrings(&formattedLinesBuffer,
+ line[:openActiveIndex],
+ line[openActiveIndex:closeActiveIndex],
+ line[closeActiveIndex:],
+ )
+ } else {
+ err = writeStrings(&formattedLinesBuffer, line)
+ }
+ if err != nil {
+ return nil, err
+ }
+
+ lineNums = append(lineNums, startLineNum+i)
+ index += len(line)
+ }
+
+ return &Result{
+ RepoID: result.RepoID,
+ Filename: result.Filename,
+ CommitID: result.CommitID,
+ UpdatedUnix: result.UpdatedUnix,
+ Language: result.Language,
+ Color: result.Color,
+ Lines: HighlightSearchResultCode(result.Filename, lineNums, highlightRanges, formattedLinesBuffer.String()),
+ }, nil
+}
+
+// PerformSearch perform a search on a repository
+// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
+func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) {
+ if opts == nil || len(opts.Keyword) == 0 {
+ return 0, nil, nil, nil
+ }
+
+ total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, opts)
+ if err != nil {
+ return 0, nil, nil, err
+ }
+
+ displayResults := make([]*Result, len(results))
+
+ for i, result := range results {
+ startIndex, endIndex := indices(result.Content, result.StartIndex, result.EndIndex)
+ displayResults[i], err = searchResult(result, startIndex, endIndex)
+ if err != nil {
+ return 0, nil, nil, err
+ }
+ }
+ return int(total), displayResults, resultLanguages, nil
+}