diff options
author | Daniel Baumann <daniel@debian.org> | 2024-10-18 20:33:49 +0200 |
---|---|---|
committer | Daniel Baumann <daniel@debian.org> | 2024-12-12 23:57:56 +0100 |
commit | e68b9d00a6e05b3a941f63ffb696f91e554ac5ec (patch) | |
tree | 97775d6c13b0f416af55314eb6a89ef792474615 /modules/indexer/code | |
parent | Initial commit. (diff) | |
download | forgejo-e68b9d00a6e05b3a941f63ffb696f91e554ac5ec.tar.xz forgejo-e68b9d00a6e05b3a941f63ffb696f91e554ac5ec.zip |
Adding upstream version 9.0.3.
Signed-off-by: Daniel Baumann <daniel@debian.org>
Diffstat (limited to '')
-rw-r--r-- | modules/indexer/code/bleve/bleve.go | 354 | ||||
-rw-r--r-- | modules/indexer/code/elasticsearch/elasticsearch.go | 388 | ||||
-rw-r--r-- | modules/indexer/code/elasticsearch/elasticsearch_test.go | 16 | ||||
-rw-r--r-- | modules/indexer/code/git.go | 199 | ||||
-rw-r--r-- | modules/indexer/code/indexer.go | 310 | ||||
-rw-r--r-- | modules/indexer/code/indexer_test.go | 145 | ||||
-rw-r--r-- | modules/indexer/code/internal/indexer.go | 54 | ||||
-rw-r--r-- | modules/indexer/code/internal/model.go | 44 | ||||
-rw-r--r-- | modules/indexer/code/internal/util.go | 32 | ||||
-rw-r--r-- | modules/indexer/code/search.go | 228 |
10 files changed, 1770 insertions, 0 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go new file mode 100644 index 0000000..cf9fcbd --- /dev/null +++ b/modules/indexer/code/bleve/bleve.go @@ -0,0 +1,354 @@ +// Copyright 2019 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package bleve + +import ( + "bufio" + "context" + "fmt" + "io" + "strconv" + "strings" + "time" + + repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/modules/analyze" + "code.gitea.io/gitea/modules/charset" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/gitrepo" + "code.gitea.io/gitea/modules/indexer/code/internal" + indexer_internal "code.gitea.io/gitea/modules/indexer/internal" + inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/timeutil" + "code.gitea.io/gitea/modules/typesniffer" + + "github.com/blevesearch/bleve/v2" + analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom" + analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword" + "github.com/blevesearch/bleve/v2/analysis/token/camelcase" + "github.com/blevesearch/bleve/v2/analysis/token/lowercase" + "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/search/query" + "github.com/go-enry/go-enry/v2" +) + +const ( + unicodeNormalizeName = "unicodeNormalize" + maxBatchSize = 16 + // fuzzyDenominator determines the levenshtein distance per each character of a keyword + fuzzyDenominator = 4 + // see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311 + maxFuzziness = 2 +) + +func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { + return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{ + "type": unicodenorm.Name, + "form": unicodenorm.NFC, + }) +} + +// RepoIndexerData data stored in the repo indexer +type RepoIndexerData struct { + RepoID int64 + CommitID string + Content string + Language string + UpdatedAt time.Time +} + +// Type returns the document type, for bleve's mapping.Classifier interface. +func (d *RepoIndexerData) Type() string { + return repoIndexerDocType +} + +const ( + repoIndexerAnalyzer = "repoIndexerAnalyzer" + repoIndexerDocType = "repoIndexerDocType" + repoIndexerLatestVersion = 6 +) + +// generateBleveIndexMapping generates a bleve index mapping for the repo indexer +func generateBleveIndexMapping() (mapping.IndexMapping, error) { + docMapping := bleve.NewDocumentMapping() + numericFieldMapping := bleve.NewNumericFieldMapping() + numericFieldMapping.IncludeInAll = false + docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping) + + textFieldMapping := bleve.NewTextFieldMapping() + textFieldMapping.IncludeInAll = false + docMapping.AddFieldMappingsAt("Content", textFieldMapping) + + termFieldMapping := bleve.NewTextFieldMapping() + termFieldMapping.IncludeInAll = false + termFieldMapping.Analyzer = analyzer_keyword.Name + docMapping.AddFieldMappingsAt("Language", termFieldMapping) + docMapping.AddFieldMappingsAt("CommitID", termFieldMapping) + + timeFieldMapping := bleve.NewDateTimeFieldMapping() + timeFieldMapping.IncludeInAll = false + docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping) + + mapping := bleve.NewIndexMapping() + if err := addUnicodeNormalizeTokenFilter(mapping); err != nil { + return nil, err + } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ + "type": analyzer_custom.Name, + "char_filters": []string{}, + "tokenizer": unicode.Name, + "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, + }); err != nil { + return nil, err + } + mapping.DefaultAnalyzer = repoIndexerAnalyzer + mapping.AddDocumentMapping(repoIndexerDocType, docMapping) + mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping()) + + return mapping, nil +} + +var _ internal.Indexer = &Indexer{} + +// Indexer represents a bleve indexer implementation +type Indexer struct { + inner *inner_bleve.Indexer + indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much +} + +// NewIndexer creates a new bleve local indexer +func NewIndexer(indexDir string) *Indexer { + inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping) + return &Indexer{ + Indexer: inner, + inner: inner, + } +} + +func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string, + update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch, +) error { + // Ignore vendored files in code search + if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { + return nil + } + + size := update.Size + + var err error + if !update.Sized { + var stdout string + stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) + if err != nil { + return err + } + if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil { + return fmt.Errorf("misformatted git cat-file output: %w", err) + } + } + + if size > setting.Indexer.MaxIndexerFileSize { + return b.addDelete(update.Filename, repo, batch) + } + + if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil { + return err + } + + _, _, size, err = git.ReadBatchLine(batchReader) + if err != nil { + return err + } + + fileContents, err := io.ReadAll(io.LimitReader(batchReader, size)) + if err != nil { + return err + } else if !typesniffer.DetectContentType(fileContents).IsText() { + // FIXME: UTF-16 files will probably fail here + return nil + } + + if _, err = batchReader.Discard(1); err != nil { + return err + } + id := internal.FilenameIndexerID(repo.ID, update.Filename) + return batch.Index(id, &RepoIndexerData{ + RepoID: repo.ID, + CommitID: commitSha, + Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), + Language: analyze.GetCodeLanguage(update.Filename, fileContents), + UpdatedAt: time.Now().UTC(), + }) +} + +func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error { + id := internal.FilenameIndexerID(repo.ID, filename) + return batch.Delete(id) +} + +// Index indexes the data +func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error { + batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize) + if len(changes.Updates) > 0 { + r, err := gitrepo.OpenRepository(ctx, repo) + if err != nil { + return err + } + defer r.Close() + gitBatch, err := r.NewBatch(ctx) + if err != nil { + return err + } + defer gitBatch.Close() + + for _, update := range changes.Updates { + if err := b.addUpdate(ctx, gitBatch.Writer, gitBatch.Reader, sha, update, repo, batch); err != nil { + return err + } + } + gitBatch.Close() + } + for _, filename := range changes.RemovedFilenames { + if err := b.addDelete(filename, repo, batch); err != nil { + return err + } + } + return batch.Flush() +} + +// Delete deletes indexes by ids +func (b *Indexer) Delete(_ context.Context, repoID int64) error { + query := inner_bleve.NumericEqualityQuery(repoID, "RepoID") + searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false) + result, err := b.inner.Indexer.Search(searchRequest) + if err != nil { + return err + } + batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize) + for _, hit := range result.Hits { + if err = batch.Delete(hit.ID); err != nil { + return err + } + } + return batch.Flush() +} + +// Search searches for files in the specified repo. +// Returns the matching file-paths +func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { + var ( + indexerQuery query.Query + keywordQuery query.Query + ) + + phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword) + phraseQuery.FieldVal = "Content" + phraseQuery.Analyzer = repoIndexerAnalyzer + keywordQuery = phraseQuery + if opts.IsKeywordFuzzy { + phraseQuery.Fuzziness = min(maxFuzziness, len(opts.Keyword)/fuzzyDenominator) + } + + if len(opts.RepoIDs) > 0 { + repoQueries := make([]query.Query, 0, len(opts.RepoIDs)) + for _, repoID := range opts.RepoIDs { + repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID")) + } + + indexerQuery = bleve.NewConjunctionQuery( + bleve.NewDisjunctionQuery(repoQueries...), + keywordQuery, + ) + } else { + indexerQuery = keywordQuery + } + + // Save for reuse without language filter + facetQuery := indexerQuery + if len(opts.Language) > 0 { + languageQuery := bleve.NewMatchQuery(opts.Language) + languageQuery.FieldVal = "Language" + languageQuery.Analyzer = analyzer_keyword.Name + + indexerQuery = bleve.NewConjunctionQuery( + indexerQuery, + languageQuery, + ) + } + + from, pageSize := opts.GetSkipTake() + searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false) + searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} + searchRequest.IncludeLocations = true + + if len(opts.Language) == 0 { + searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) + } + + result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest) + if err != nil { + return 0, nil, nil, err + } + + total := int64(result.Total) + + searchResults := make([]*internal.SearchResult, len(result.Hits)) + for i, hit := range result.Hits { + startIndex, endIndex := -1, -1 + for _, locations := range hit.Locations["Content"] { + location := locations[0] + locationStart := int(location.Start) + locationEnd := int(location.End) + if startIndex < 0 || locationStart < startIndex { + startIndex = locationStart + } + if endIndex < 0 || locationEnd > endIndex { + endIndex = locationEnd + } + } + language := hit.Fields["Language"].(string) + var updatedUnix timeutil.TimeStamp + if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil { + updatedUnix = timeutil.TimeStamp(t.Unix()) + } + searchResults[i] = &internal.SearchResult{ + RepoID: int64(hit.Fields["RepoID"].(float64)), + StartIndex: startIndex, + EndIndex: endIndex, + Filename: internal.FilenameOfIndexerID(hit.ID), + Content: hit.Fields["Content"].(string), + CommitID: hit.Fields["CommitID"].(string), + UpdatedUnix: updatedUnix, + Language: language, + Color: enry.GetColor(language), + } + } + + searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10) + if len(opts.Language) > 0 { + // Use separate query to go get all language counts + facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false) + facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} + facetRequest.IncludeLocations = true + facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) + + if result, err = b.inner.Indexer.Search(facetRequest); err != nil { + return 0, nil, nil, err + } + } + languagesFacet := result.Facets["languages"] + for _, term := range languagesFacet.Terms.Terms() { + if len(term.Term) == 0 { + continue + } + searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{ + Language: term.Term, + Color: enry.GetColor(term.Term), + Count: term.Count, + }) + } + return total, searchResults, searchResultLanguages, nil +} diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go new file mode 100644 index 0000000..aee5668 --- /dev/null +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -0,0 +1,388 @@ +// Copyright 2020 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package elasticsearch + +import ( + "bufio" + "context" + "fmt" + "io" + "strconv" + "strings" + + repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/modules/analyze" + "code.gitea.io/gitea/modules/charset" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/gitrepo" + "code.gitea.io/gitea/modules/indexer/code/internal" + indexer_internal "code.gitea.io/gitea/modules/indexer/internal" + inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch" + "code.gitea.io/gitea/modules/json" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/timeutil" + "code.gitea.io/gitea/modules/typesniffer" + + "github.com/go-enry/go-enry/v2" + "github.com/olivere/elastic/v7" +) + +const ( + esRepoIndexerLatestVersion = 1 + // multi-match-types, currently only 2 types are used + // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types + esMultiMatchTypeBestFields = "best_fields" + esMultiMatchTypePhrasePrefix = "phrase_prefix" +) + +var _ internal.Indexer = &Indexer{} + +// Indexer implements Indexer interface +type Indexer struct { + inner *inner_elasticsearch.Indexer + indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much +} + +// NewIndexer creates a new elasticsearch indexer +func NewIndexer(url, indexerName string) *Indexer { + inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping) + indexer := &Indexer{ + inner: inner, + Indexer: inner, + } + return indexer +} + +const ( + defaultMapping = `{ + "mappings": { + "properties": { + "repo_id": { + "type": "long", + "index": true + }, + "content": { + "type": "text", + "term_vector": "with_positions_offsets", + "index": true + }, + "commit_id": { + "type": "keyword", + "index": true + }, + "language": { + "type": "keyword", + "index": true + }, + "updated_at": { + "type": "long", + "index": true + } + } + } + }` +) + +func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) { + // Ignore vendored files in code search + if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { + return nil, nil + } + + size := update.Size + var err error + if !update.Sized { + var stdout string + stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) + if err != nil { + return nil, err + } + if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil { + return nil, fmt.Errorf("misformatted git cat-file output: %w", err) + } + } + + if size > setting.Indexer.MaxIndexerFileSize { + return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil + } + + if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil { + return nil, err + } + + _, _, size, err = git.ReadBatchLine(batchReader) + if err != nil { + return nil, err + } + + fileContents, err := io.ReadAll(io.LimitReader(batchReader, size)) + if err != nil { + return nil, err + } else if !typesniffer.DetectContentType(fileContents).IsText() { + // FIXME: UTF-16 files will probably fail here + return nil, nil + } + + if _, err = batchReader.Discard(1); err != nil { + return nil, err + } + id := internal.FilenameIndexerID(repo.ID, update.Filename) + + return []elastic.BulkableRequest{ + elastic.NewBulkIndexRequest(). + Index(b.inner.VersionedIndexName()). + Id(id). + Doc(map[string]any{ + "repo_id": repo.ID, + "content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), + "commit_id": sha, + "language": analyze.GetCodeLanguage(update.Filename, fileContents), + "updated_at": timeutil.TimeStampNow(), + }), + }, nil +} + +func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest { + id := internal.FilenameIndexerID(repo.ID, filename) + return elastic.NewBulkDeleteRequest(). + Index(b.inner.VersionedIndexName()). + Id(id) +} + +// Index will save the index data +func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error { + reqs := make([]elastic.BulkableRequest, 0) + if len(changes.Updates) > 0 { + r, err := gitrepo.OpenRepository(ctx, repo) + if err != nil { + return err + } + defer r.Close() + batch, err := r.NewBatch(ctx) + if err != nil { + return err + } + defer batch.Close() + + for _, update := range changes.Updates { + updateReqs, err := b.addUpdate(ctx, batch.Writer, batch.Reader, sha, update, repo) + if err != nil { + return err + } + if len(updateReqs) > 0 { + reqs = append(reqs, updateReqs...) + } + } + batch.Close() + } + + for _, filename := range changes.RemovedFilenames { + reqs = append(reqs, b.addDelete(filename, repo)) + } + + if len(reqs) > 0 { + esBatchSize := 50 + + for i := 0; i < len(reqs); i += esBatchSize { + _, err := b.inner.Client.Bulk(). + Index(b.inner.VersionedIndexName()). + Add(reqs[i:min(i+esBatchSize, len(reqs))]...). + Do(ctx) + if err != nil { + return err + } + } + } + return nil +} + +// Delete entries by repoId +func (b *Indexer) Delete(ctx context.Context, repoID int64) error { + if err := b.doDelete(ctx, repoID); err != nil { + // Maybe there is a conflict during the delete operation, so we should retry after a refresh + log.Warn("Deletion of entries of repo %v within index %v was erroneus. Trying to refresh index before trying again", repoID, b.inner.VersionedIndexName(), err) + if err := b.refreshIndex(ctx); err != nil { + return err + } + if err := b.doDelete(ctx, repoID); err != nil { + log.Error("Could not delete entries of repo %v within index %v", repoID, b.inner.VersionedIndexName()) + return err + } + } + return nil +} + +func (b *Indexer) refreshIndex(ctx context.Context) error { + if _, err := b.inner.Client.Refresh(b.inner.VersionedIndexName()).Do(ctx); err != nil { + log.Error("Error while trying to refresh index %v", b.inner.VersionedIndexName(), err) + return err + } + + return nil +} + +// Delete entries by repoId +func (b *Indexer) doDelete(ctx context.Context, repoID int64) error { + _, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()). + Query(elastic.NewTermsQuery("repo_id", repoID)). + Do(ctx) + return err +} + +// indexPos find words positions for start and the following end on content. It will +// return the beginning position of the first start and the ending position of the +// first end following the start string. +// If not found any of the positions, it will return -1, -1. +func indexPos(content, start, end string) (int, int) { + startIdx := strings.Index(content, start) + if startIdx < 0 { + return -1, -1 + } + endIdx := strings.Index(content[startIdx+len(start):], end) + if endIdx < 0 { + return -1, -1 + } + return startIdx, startIdx + len(start) + endIdx + len(end) +} + +func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { + hits := make([]*internal.SearchResult, 0, pageSize) + for _, hit := range searchResult.Hits.Hits { + // FIXME: There is no way to get the position the keyword on the content currently on the same request. + // So we get it from content, this may made the query slower. See + // https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291 + var startIndex, endIndex int + c, ok := hit.Highlight["content"] + if ok && len(c) > 0 { + // FIXME: Since the highlighting content will include <em> and </em> for the keywords, + // now we should find the positions. But how to avoid html content which contains the + // <em> and </em> tags? If elastic search has handled that? + startIndex, endIndex = indexPos(c[0], "<em>", "</em>") + if startIndex == -1 { + panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0])) + } + } else { + panic(fmt.Sprintf("2===%#v", hit.Highlight)) + } + + repoID, fileName := internal.ParseIndexerID(hit.Id) + res := make(map[string]any) + if err := json.Unmarshal(hit.Source, &res); err != nil { + return 0, nil, nil, err + } + + language := res["language"].(string) + + hits = append(hits, &internal.SearchResult{ + RepoID: repoID, + Filename: fileName, + CommitID: res["commit_id"].(string), + Content: res["content"].(string), + UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)), + Language: language, + StartIndex: startIndex, + EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data + Color: enry.GetColor(language), + }) + } + + return searchResult.TotalHits(), hits, extractAggs(searchResult), nil +} + +func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages { + var searchResultLanguages []*internal.SearchResultLanguages + agg, found := searchResult.Aggregations.Terms("language") + if found { + searchResultLanguages = make([]*internal.SearchResultLanguages, 0, 10) + + for _, bucket := range agg.Buckets { + searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{ + Language: bucket.Key.(string), + Color: enry.GetColor(bucket.Key.(string)), + Count: int(bucket.DocCount), + }) + } + } + return searchResultLanguages +} + +// Search searches for codes and language stats by given conditions. +func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { + searchType := esMultiMatchTypePhrasePrefix + if opts.IsKeywordFuzzy { + searchType = esMultiMatchTypeBestFields + } + + kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType) + query := elastic.NewBoolQuery() + query = query.Must(kwQuery) + if len(opts.RepoIDs) > 0 { + repoStrs := make([]any, 0, len(opts.RepoIDs)) + for _, repoID := range opts.RepoIDs { + repoStrs = append(repoStrs, repoID) + } + repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...) + query = query.Must(repoQuery) + } + + var ( + start, pageSize = opts.GetSkipTake() + kw = "<em>" + opts.Keyword + "</em>" + aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc() + ) + + if len(opts.Language) == 0 { + searchResult, err := b.inner.Client.Search(). + Index(b.inner.VersionedIndexName()). + Aggregation("language", aggregation). + Query(query). + Highlight( + elastic.NewHighlight(). + Field("content"). + NumOfFragments(0). // return all highting content on fragments + HighlighterType("fvh"), + ). + Sort("repo_id", true). + From(start).Size(pageSize). + Do(ctx) + if err != nil { + return 0, nil, nil, err + } + + return convertResult(searchResult, kw, pageSize) + } + + langQuery := elastic.NewMatchQuery("language", opts.Language) + countResult, err := b.inner.Client.Search(). + Index(b.inner.VersionedIndexName()). + Aggregation("language", aggregation). + Query(query). + Size(0). // We only need stats information + Do(ctx) + if err != nil { + return 0, nil, nil, err + } + + query = query.Must(langQuery) + searchResult, err := b.inner.Client.Search(). + Index(b.inner.VersionedIndexName()). + Query(query). + Highlight( + elastic.NewHighlight(). + Field("content"). + NumOfFragments(0). // return all highting content on fragments + HighlighterType("fvh"), + ). + Sort("repo_id", true). + From(start).Size(pageSize). + Do(ctx) + if err != nil { + return 0, nil, nil, err + } + + total, hits, _, err := convertResult(searchResult, kw, pageSize) + + return total, hits, extractAggs(countResult), err +} diff --git a/modules/indexer/code/elasticsearch/elasticsearch_test.go b/modules/indexer/code/elasticsearch/elasticsearch_test.go new file mode 100644 index 0000000..c6ba93e --- /dev/null +++ b/modules/indexer/code/elasticsearch/elasticsearch_test.go @@ -0,0 +1,16 @@ +// Copyright 2020 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package elasticsearch + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIndexPos(t *testing.T) { + startIdx, endIdx := indexPos("test index start and end", "start", "end") + assert.EqualValues(t, 11, startIdx) + assert.EqualValues(t, 24, endIdx) +} diff --git a/modules/indexer/code/git.go b/modules/indexer/code/git.go new file mode 100644 index 0000000..c7ffcfd --- /dev/null +++ b/modules/indexer/code/git.go @@ -0,0 +1,199 @@ +// Copyright 2019 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package code + +import ( + "context" + "strconv" + "strings" + + repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/indexer/code/internal" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" +) + +func getDefaultBranchSha(ctx context.Context, repo *repo_model.Repository) (string, error) { + stdout, _, err := git.NewCommand(ctx, "show-ref", "-s").AddDynamicArguments(git.BranchPrefix + repo.DefaultBranch).RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) + if err != nil { + return "", err + } + return strings.TrimSpace(stdout), nil +} + +// getRepoChanges returns changes to repo since last indexer update +func getRepoChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) { + status, err := repo_model.GetIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode) + if err != nil { + return nil, err + } + + needGenesis := len(status.CommitSha) == 0 + if !needGenesis { + hasAncestorCmd := git.NewCommand(ctx, "merge-base").AddDynamicArguments(status.CommitSha, revision) + stdout, _, _ := hasAncestorCmd.RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) + needGenesis = len(stdout) == 0 + } + + if needGenesis { + return genesisChanges(ctx, repo, revision) + } + return nonGenesisChanges(ctx, repo, revision) +} + +func isIndexable(entry *git.TreeEntry) bool { + if !entry.IsRegular() && !entry.IsExecutable() { + return false + } + name := strings.ToLower(entry.Name()) + for _, g := range setting.Indexer.ExcludePatterns { + if g.Match(name) { + return false + } + } + for _, g := range setting.Indexer.IncludePatterns { + if g.Match(name) { + return true + } + } + return len(setting.Indexer.IncludePatterns) == 0 +} + +// parseGitLsTreeOutput parses the output of a `git ls-tree -r --full-name` command +func parseGitLsTreeOutput(stdout []byte) ([]internal.FileUpdate, error) { + entries, err := git.ParseTreeEntries(stdout) + if err != nil { + return nil, err + } + idxCount := 0 + updates := make([]internal.FileUpdate, len(entries)) + for _, entry := range entries { + if isIndexable(entry) { + updates[idxCount] = internal.FileUpdate{ + Filename: entry.Name(), + BlobSha: entry.ID.String(), + Size: entry.Size(), + Sized: true, + } + idxCount++ + } + } + return updates[:idxCount], nil +} + +// genesisChanges get changes to add repo to the indexer for the first time +func genesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) { + var changes internal.RepoChanges + stdout, _, runErr := git.NewCommand(ctx, "ls-tree", "--full-tree", "-l", "-r").AddDynamicArguments(revision).RunStdBytes(&git.RunOpts{Dir: repo.RepoPath()}) + if runErr != nil { + return nil, runErr + } + + var err error + changes.Updates, err = parseGitLsTreeOutput(stdout) + return &changes, err +} + +// nonGenesisChanges get changes since the previous indexer update +func nonGenesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) { + diffCmd := git.NewCommand(ctx, "diff", "--name-status").AddDynamicArguments(repo.CodeIndexerStatus.CommitSha, revision) + stdout, _, runErr := diffCmd.RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) + if runErr != nil { + // previous commit sha may have been removed by a force push, so + // try rebuilding from scratch + log.Warn("git diff: %v", runErr) + if err := (*globalIndexer.Load()).Delete(ctx, repo.ID); err != nil { + return nil, err + } + return genesisChanges(ctx, repo, revision) + } + + var changes internal.RepoChanges + var err error + updatedFilenames := make([]string, 0, 10) + + updateChanges := func() error { + cmd := git.NewCommand(ctx, "ls-tree", "--full-tree", "-l").AddDynamicArguments(revision). + AddDashesAndList(updatedFilenames...) + lsTreeStdout, _, err := cmd.RunStdBytes(&git.RunOpts{Dir: repo.RepoPath()}) + if err != nil { + return err + } + + updates, err1 := parseGitLsTreeOutput(lsTreeStdout) + if err1 != nil { + return err1 + } + changes.Updates = append(changes.Updates, updates...) + return nil + } + lines := strings.Split(stdout, "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if len(line) == 0 { + continue + } + fields := strings.Split(line, "\t") + if len(fields) < 2 { + log.Warn("Unparsable output for diff --name-status: `%s`)", line) + continue + } + filename := fields[1] + if len(filename) == 0 { + continue + } else if filename[0] == '"' { + filename, err = strconv.Unquote(filename) + if err != nil { + return nil, err + } + } + + switch status := fields[0][0]; status { + case 'M', 'A': + updatedFilenames = append(updatedFilenames, filename) + case 'D': + changes.RemovedFilenames = append(changes.RemovedFilenames, filename) + case 'R', 'C': + if len(fields) < 3 { + log.Warn("Unparsable output for diff --name-status: `%s`)", line) + continue + } + dest := fields[2] + if len(dest) == 0 { + log.Warn("Unparsable output for diff --name-status: `%s`)", line) + continue + } + if dest[0] == '"' { + dest, err = strconv.Unquote(dest) + if err != nil { + return nil, err + } + } + if status == 'R' { + changes.RemovedFilenames = append(changes.RemovedFilenames, filename) + } + updatedFilenames = append(updatedFilenames, dest) + default: + log.Warn("Unrecognized status: %c (line=%s)", status, line) + } + + // According to https://learn.microsoft.com/en-us/troubleshoot/windows-client/shell-experience/command-line-string-limitation#more-information + // the command line length should less than 8191 characters, assume filepath is 256, then 8191/256 = 31, so we use 30 + if len(updatedFilenames) >= 30 { + if err := updateChanges(); err != nil { + return nil, err + } + updatedFilenames = updatedFilenames[0:0] + } + } + + if len(updatedFilenames) > 0 { + if err := updateChanges(); err != nil { + return nil, err + } + } + + return &changes, err +} diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go new file mode 100644 index 0000000..0a8ce27 --- /dev/null +++ b/modules/indexer/code/indexer.go @@ -0,0 +1,310 @@ +// Copyright 2016 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package code + +import ( + "context" + "os" + "runtime/pprof" + "slices" + "sync/atomic" + "time" + + "code.gitea.io/gitea/models/db" + repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/modules/graceful" + "code.gitea.io/gitea/modules/indexer/code/bleve" + "code.gitea.io/gitea/modules/indexer/code/elasticsearch" + "code.gitea.io/gitea/modules/indexer/code/internal" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/process" + "code.gitea.io/gitea/modules/queue" + "code.gitea.io/gitea/modules/setting" +) + +var ( + indexerQueue *queue.WorkerPoolQueue[*internal.IndexerData] + // globalIndexer is the global indexer, it cannot be nil. + // When the real indexer is not ready, it will be a dummy indexer which will return error to explain it's not ready. + // So it's always safe use it as *globalIndexer.Load() and call its methods. + globalIndexer atomic.Pointer[internal.Indexer] + dummyIndexer *internal.Indexer +) + +func init() { + i := internal.NewDummyIndexer() + dummyIndexer = &i + globalIndexer.Store(dummyIndexer) +} + +func index(ctx context.Context, indexer internal.Indexer, repoID int64) error { + repo, err := repo_model.GetRepositoryByID(ctx, repoID) + if repo_model.IsErrRepoNotExist(err) { + return indexer.Delete(ctx, repoID) + } + if err != nil { + return err + } + + repoTypes := setting.Indexer.RepoIndexerRepoTypes + + if len(repoTypes) == 0 { + repoTypes = []string{"sources"} + } + + // skip forks from being indexed if unit is not present + if !slices.Contains(repoTypes, "forks") && repo.IsFork { + return nil + } + + // skip mirrors from being indexed if unit is not present + if !slices.Contains(repoTypes, "mirrors") && repo.IsMirror { + return nil + } + + // skip templates from being indexed if unit is not present + if !slices.Contains(repoTypes, "templates") && repo.IsTemplate { + return nil + } + + // skip regular repos from being indexed if unit is not present + if !slices.Contains(repoTypes, "sources") && !repo.IsFork && !repo.IsMirror && !repo.IsTemplate { + return nil + } + + sha, err := getDefaultBranchSha(ctx, repo) + if err != nil { + return err + } + changes, err := getRepoChanges(ctx, repo, sha) + if err != nil { + return err + } else if changes == nil { + return nil + } + + if err := indexer.Index(ctx, repo, sha, changes); err != nil { + return err + } + + return repo_model.UpdateIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode, sha) +} + +// Init initialize the repo indexer +func Init() { + if !setting.Indexer.RepoIndexerEnabled { + (*globalIndexer.Load()).Close() + return + } + + ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), "Service: CodeIndexer", process.SystemProcessType, false) + + graceful.GetManager().RunAtTerminate(func() { + select { + case <-ctx.Done(): + return + default: + } + cancel() + log.Debug("Closing repository indexer") + (*globalIndexer.Load()).Close() + log.Info("PID: %d Repository Indexer closed", os.Getpid()) + finished() + }) + + waitChannel := make(chan time.Duration, 1) + + // Create the Queue + switch setting.Indexer.RepoType { + case "bleve", "elasticsearch": + handler := func(items ...*internal.IndexerData) (unhandled []*internal.IndexerData) { + indexer := *globalIndexer.Load() + // make it a process to allow for cancellation (especially during integration tests where no global shutdown happens) + batchCtx, _, finished := process.GetManager().AddContext(ctx, "CodeIndexer batch") + defer finished() + for _, indexerData := range items { + log.Trace("IndexerData Process Repo: %d", indexerData.RepoID) + if err := index(batchCtx, indexer, indexerData.RepoID); err != nil { + unhandled = append(unhandled, indexerData) + if !setting.IsInTesting { + log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err) + } + } + } + return unhandled + } + + indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler) + if indexerQueue == nil { + log.Fatal("Unable to create codes indexer queue") + } + default: + log.Fatal("Unknown codes indexer type; %s", setting.Indexer.RepoType) + } + + go func() { + pprof.SetGoroutineLabels(ctx) + start := time.Now() + var ( + rIndexer internal.Indexer + existed bool + err error + ) + switch setting.Indexer.RepoType { + case "bleve": + log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), setting.Indexer.RepoPath) + defer func() { + if err := recover(); err != nil { + log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2)) + log.Error("The indexer files are likely corrupted and may need to be deleted") + log.Error("You can completely remove the \"%s\" directory to make Forgejo recreate the indexes", setting.Indexer.RepoPath) + } + }() + + rIndexer = bleve.NewIndexer(setting.Indexer.RepoPath) + existed, err = rIndexer.Init(ctx) + if err != nil { + cancel() + (*globalIndexer.Load()).Close() + close(waitChannel) + log.Fatal("PID: %d Unable to initialize the bleve Repository Indexer at path: %s Error: %v", os.Getpid(), setting.Indexer.RepoPath, err) + } + case "elasticsearch": + log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), setting.Indexer.RepoConnStr) + defer func() { + if err := recover(); err != nil { + log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2)) + log.Error("The indexer files are likely corrupted and may need to be deleted") + log.Error("You can completely remove the \"%s\" index to make Forgejo recreate the indexes", setting.Indexer.RepoConnStr) + } + }() + + rIndexer = elasticsearch.NewIndexer(setting.Indexer.RepoConnStr, setting.Indexer.RepoIndexerName) + existed, err = rIndexer.Init(ctx) + if err != nil { + cancel() + (*globalIndexer.Load()).Close() + close(waitChannel) + log.Fatal("PID: %d Unable to initialize the elasticsearch Repository Indexer connstr: %s Error: %v", os.Getpid(), setting.Indexer.RepoConnStr, err) + } + + default: + log.Fatal("PID: %d Unknown Indexer type: %s", os.Getpid(), setting.Indexer.RepoType) + } + + globalIndexer.Store(&rIndexer) + + // Start processing the queue + go graceful.GetManager().RunWithCancel(indexerQueue) + + if !existed { // populate the index because it's created for the first time + go graceful.GetManager().RunWithShutdownContext(populateRepoIndexer) + } + select { + case waitChannel <- time.Since(start): + case <-graceful.GetManager().IsShutdown(): + } + + close(waitChannel) + }() + + if setting.Indexer.StartupTimeout > 0 { + go func() { + pprof.SetGoroutineLabels(ctx) + timeout := setting.Indexer.StartupTimeout + if graceful.GetManager().IsChild() && setting.GracefulHammerTime > 0 { + timeout += setting.GracefulHammerTime + } + select { + case <-graceful.GetManager().IsShutdown(): + log.Warn("Shutdown before Repository Indexer completed initialization") + cancel() + (*globalIndexer.Load()).Close() + case duration, ok := <-waitChannel: + if !ok { + log.Warn("Repository Indexer Initialization failed") + cancel() + (*globalIndexer.Load()).Close() + return + } + log.Info("Repository Indexer Initialization took %v", duration) + case <-time.After(timeout): + cancel() + (*globalIndexer.Load()).Close() + log.Fatal("Repository Indexer Initialization Timed-Out after: %v", timeout) + } + }() + } +} + +// UpdateRepoIndexer update a repository's entries in the indexer +func UpdateRepoIndexer(repo *repo_model.Repository) { + indexData := &internal.IndexerData{RepoID: repo.ID} + if err := indexerQueue.Push(indexData); err != nil { + log.Error("Update repo index data %v failed: %v", indexData, err) + } +} + +// IsAvailable checks if issue indexer is available +func IsAvailable(ctx context.Context) bool { + return (*globalIndexer.Load()).Ping(ctx) == nil +} + +// populateRepoIndexer populate the repo indexer with pre-existing data. This +// should only be run when the indexer is created for the first time. +func populateRepoIndexer(ctx context.Context) { + log.Info("Populating the repo indexer with existing repositories") + + exist, err := db.IsTableNotEmpty("repository") + if err != nil { + log.Fatal("System error: %v", err) + } else if !exist { + return + } + + // if there is any existing repo indexer metadata in the DB, delete it + // since we are starting afresh. Also, xorm requires deletes to have a + // condition, and we want to delete everything, thus 1=1. + if err := db.DeleteAllRecords("repo_indexer_status"); err != nil { + log.Fatal("System error: %v", err) + } + + var maxRepoID int64 + if maxRepoID, err = db.GetMaxID("repository"); err != nil { + log.Fatal("System error: %v", err) + } + + // start with the maximum existing repo ID and work backwards, so that we + // don't include repos that are created after gitea starts; such repos will + // already be added to the indexer, and we don't need to add them again. + for maxRepoID > 0 { + select { + case <-ctx.Done(): + log.Info("Repository Indexer population shutdown before completion") + return + default: + } + ids, err := repo_model.GetUnindexedRepos(ctx, repo_model.RepoIndexerTypeCode, maxRepoID, 0, 50) + if err != nil { + log.Error("populateRepoIndexer: %v", err) + return + } else if len(ids) == 0 { + break + } + for _, id := range ids { + select { + case <-ctx.Done(): + log.Info("Repository Indexer population shutdown before completion") + return + default: + } + if err := indexerQueue.Push(&internal.IndexerData{RepoID: id}); err != nil { + log.Error("indexerQueue.Push: %v", err) + return + } + maxRepoID = id - 1 + } + } + log.Info("Done (re)populating the repo indexer with existing repositories") +} diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go new file mode 100644 index 0000000..967aad1 --- /dev/null +++ b/modules/indexer/code/indexer_test.go @@ -0,0 +1,145 @@ +// Copyright 2020 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package code + +import ( + "context" + "os" + "testing" + + "code.gitea.io/gitea/models/db" + "code.gitea.io/gitea/models/unittest" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/indexer/code/bleve" + "code.gitea.io/gitea/modules/indexer/code/elasticsearch" + "code.gitea.io/gitea/modules/indexer/code/internal" + + _ "code.gitea.io/gitea/models" + _ "code.gitea.io/gitea/models/actions" + _ "code.gitea.io/gitea/models/activities" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestMain(m *testing.M) { + unittest.MainTest(m) +} + +func testIndexer(name string, t *testing.T, indexer internal.Indexer) { + t.Run(name, func(t *testing.T) { + var repoID int64 = 1 + err := index(git.DefaultContext, indexer, repoID) + require.NoError(t, err) + keywords := []struct { + RepoIDs []int64 + Keyword string + IDs []int64 + Langs int + }{ + { + RepoIDs: nil, + Keyword: "Description", + IDs: []int64{repoID}, + Langs: 1, + }, + { + RepoIDs: []int64{2}, + Keyword: "Description", + IDs: []int64{}, + Langs: 0, + }, + { + RepoIDs: nil, + Keyword: "Description for", + IDs: []int64{repoID}, + Langs: 1, + }, + { + RepoIDs: nil, + Keyword: "repo1", + IDs: []int64{repoID}, + Langs: 1, + }, + { + RepoIDs: []int64{2}, + Keyword: "repo1", + IDs: []int64{}, + Langs: 0, + }, + { + RepoIDs: nil, + Keyword: "non-exist", + IDs: []int64{}, + Langs: 0, + }, + } + + for _, kw := range keywords { + t.Run(kw.Keyword, func(t *testing.T) { + total, res, langs, err := indexer.Search(context.TODO(), &internal.SearchOptions{ + RepoIDs: kw.RepoIDs, + Keyword: kw.Keyword, + Paginator: &db.ListOptions{ + Page: 1, + PageSize: 10, + }, + IsKeywordFuzzy: true, + }) + require.NoError(t, err) + assert.Len(t, kw.IDs, int(total)) + assert.Len(t, langs, kw.Langs) + + ids := make([]int64, 0, len(res)) + for _, hit := range res { + ids = append(ids, hit.RepoID) + assert.EqualValues(t, "# repo1\n\nDescription for repo1", hit.Content) + } + assert.EqualValues(t, kw.IDs, ids) + }) + } + + require.NoError(t, indexer.Delete(context.Background(), repoID)) + }) +} + +func TestBleveIndexAndSearch(t *testing.T) { + unittest.PrepareTestEnv(t) + + dir := t.TempDir() + + idx := bleve.NewIndexer(dir) + _, err := idx.Init(context.Background()) + if err != nil { + if idx != nil { + idx.Close() + } + assert.FailNow(t, "Unable to create bleve indexer Error: %v", err) + } + defer idx.Close() + + testIndexer("bleve", t, idx) +} + +func TestESIndexAndSearch(t *testing.T) { + unittest.PrepareTestEnv(t) + + u := os.Getenv("TEST_INDEXER_CODE_ES_URL") + if u == "" { + t.SkipNow() + return + } + + indexer := elasticsearch.NewIndexer(u, "gitea_codes") + if _, err := indexer.Init(context.Background()); err != nil { + if indexer != nil { + indexer.Close() + } + assert.FailNow(t, "Unable to init ES indexer Error: %v", err) + } + + defer indexer.Close() + + testIndexer("elastic_search", t, indexer) +} diff --git a/modules/indexer/code/internal/indexer.go b/modules/indexer/code/internal/indexer.go new file mode 100644 index 0000000..c259fcd --- /dev/null +++ b/modules/indexer/code/internal/indexer.go @@ -0,0 +1,54 @@ +// Copyright 2023 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package internal + +import ( + "context" + "fmt" + + "code.gitea.io/gitea/models/db" + repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/modules/indexer/internal" +) + +// Indexer defines an interface to index and search code contents +type Indexer interface { + internal.Indexer + Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error + Delete(ctx context.Context, repoID int64) error + Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error) +} + +type SearchOptions struct { + RepoIDs []int64 + Keyword string + Language string + + IsKeywordFuzzy bool + + db.Paginator +} + +// NewDummyIndexer returns a dummy indexer +func NewDummyIndexer() Indexer { + return &dummyIndexer{ + Indexer: internal.NewDummyIndexer(), + } +} + +type dummyIndexer struct { + internal.Indexer +} + +func (d *dummyIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error { + return fmt.Errorf("indexer is not ready") +} + +func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error { + return fmt.Errorf("indexer is not ready") +} + +func (d *dummyIndexer) Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error) { + return 0, nil, nil, fmt.Errorf("indexer is not ready") +} diff --git a/modules/indexer/code/internal/model.go b/modules/indexer/code/internal/model.go new file mode 100644 index 0000000..f75263c --- /dev/null +++ b/modules/indexer/code/internal/model.go @@ -0,0 +1,44 @@ +// Copyright 2023 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package internal + +import "code.gitea.io/gitea/modules/timeutil" + +type FileUpdate struct { + Filename string + BlobSha string + Size int64 + Sized bool +} + +// RepoChanges changes (file additions/updates/removals) to a repo +type RepoChanges struct { + Updates []FileUpdate + RemovedFilenames []string +} + +// IndexerData represents data stored in the code indexer +type IndexerData struct { + RepoID int64 +} + +// SearchResult result of performing a search in a repo +type SearchResult struct { + RepoID int64 + StartIndex int + EndIndex int + Filename string + Content string + CommitID string + UpdatedUnix timeutil.TimeStamp + Language string + Color string +} + +// SearchResultLanguages result of top languages count in search results +type SearchResultLanguages struct { + Language string + Color string + Count int +} diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go new file mode 100644 index 0000000..689c4f4 --- /dev/null +++ b/modules/indexer/code/internal/util.go @@ -0,0 +1,32 @@ +// Copyright 2023 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package internal + +import ( + "strings" + + "code.gitea.io/gitea/modules/indexer/internal" + "code.gitea.io/gitea/modules/log" +) + +func FilenameIndexerID(repoID int64, filename string) string { + return internal.Base36(repoID) + "_" + filename +} + +func ParseIndexerID(indexerID string) (int64, string) { + index := strings.IndexByte(indexerID, '_') + if index == -1 { + log.Error("Unexpected ID in repo indexer: %s", indexerID) + } + repoID, _ := internal.ParseBase36(indexerID[:index]) + return repoID, indexerID[index+1:] +} + +func FilenameOfIndexerID(indexerID string) string { + index := strings.IndexByte(indexerID, '_') + if index == -1 { + log.Error("Unexpected ID in repo indexer: %s", indexerID) + } + return indexerID[index+1:] +} diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go new file mode 100644 index 0000000..f45907a --- /dev/null +++ b/modules/indexer/code/search.go @@ -0,0 +1,228 @@ +// Copyright 2017 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package code + +import ( + "bytes" + "context" + "html/template" + "strings" + + "code.gitea.io/gitea/modules/highlight" + "code.gitea.io/gitea/modules/indexer/code/internal" + "code.gitea.io/gitea/modules/timeutil" + "code.gitea.io/gitea/services/gitdiff" +) + +// Result a search result to display +type Result struct { + RepoID int64 + Filename string + CommitID string + UpdatedUnix timeutil.TimeStamp + Language string + Color string + Lines []ResultLine +} + +type ResultLine struct { + Num int + FormattedContent template.HTML +} + +type SearchResultLanguages = internal.SearchResultLanguages + +type SearchOptions = internal.SearchOptions + +func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) { + startIndex := selectionStartIndex + numLinesBefore := 0 + for ; startIndex > 0; startIndex-- { + if content[startIndex-1] == '\n' { + if numLinesBefore == 1 { + break + } + numLinesBefore++ + } + } + + endIndex := selectionEndIndex + numLinesAfter := 0 + for ; endIndex < len(content); endIndex++ { + if content[endIndex] == '\n' { + if numLinesAfter == 1 { + break + } + numLinesAfter++ + } + } + + return startIndex, endIndex +} + +func writeStrings(buf *bytes.Buffer, strs ...string) error { + for _, s := range strs { + _, err := buf.WriteString(s) + if err != nil { + return err + } + } + return nil +} + +const ( + highlightTagStart = "<span class=\"search-highlight\">" + highlightTagEnd = "</span>" +) + +func HighlightSearchResultCode(filename string, lineNums []int, highlightRanges [][3]int, code string) []ResultLine { + hcd := gitdiff.NewHighlightCodeDiff() + hcd.CollectUsedRunes(code) + startTag, endTag := hcd.NextPlaceholder(), hcd.NextPlaceholder() + hcd.PlaceholderTokenMap[startTag] = highlightTagStart + hcd.PlaceholderTokenMap[endTag] = highlightTagEnd + + // we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting + hl, _ := highlight.Code(filename, "", code) + conv := hcd.ConvertToPlaceholders(string(hl)) + convLines := strings.Split(conv, "\n") + + // each highlightRange is of the form [line number, start pos, end pos] + for _, highlightRange := range highlightRanges { + ln, start, end := highlightRange[0], highlightRange[1], highlightRange[2] + line := convLines[ln] + if line == "" || len(line) <= start || len(line) < end { + continue + } + + sb := strings.Builder{} + count := -1 + isOpen := false + for _, r := range line { + if token, ok := hcd.PlaceholderTokenMap[r]; + // token was not found + !ok || + // token was marked as used + token == "" || + // the token is not an valid html tag emitted by chroma + !(len(token) > 6 && (token[0:5] == "<span" || token[0:6] == "</span")) { + count++ + } else if !isOpen { + // open the tag only after all other placeholders + sb.WriteRune(r) + continue + } else if isOpen && count < end { + // if the tag is open, but a placeholder exists in between + // close the tag + sb.WriteRune(endTag) + // write the placeholder + sb.WriteRune(r) + // reopen the tag + sb.WriteRune(startTag) + continue + } + + switch count { + case end: + // if tag is not open, no need to close + if !isOpen { + break + } + sb.WriteRune(endTag) + isOpen = false + case start: + // if tag is open, do not open again + if isOpen { + break + } + isOpen = true + sb.WriteRune(startTag) + } + + sb.WriteRune(r) + } + if isOpen { + sb.WriteRune(endTag) + } + convLines[ln] = sb.String() + } + conv = strings.Join(convLines, "\n") + + highlightedLines := strings.Split(hcd.Recover(conv), "\n") + // The lineNums outputted by highlight.Code might not match the original lineNums, because "highlight" removes the last `\n` + lines := make([]ResultLine, min(len(highlightedLines), len(lineNums))) + for i := 0; i < len(lines); i++ { + lines[i].Num = lineNums[i] + lines[i].FormattedContent = template.HTML(highlightedLines[i]) + } + return lines +} + +func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Result, error) { + startLineNum := 1 + strings.Count(result.Content[:startIndex], "\n") + + var formattedLinesBuffer bytes.Buffer + + contentLines := strings.SplitAfter(result.Content[startIndex:endIndex], "\n") + lineNums := make([]int, 0, len(contentLines)) + index := startIndex + var highlightRanges [][3]int + for i, line := range contentLines { + var err error + if index < result.EndIndex && + result.StartIndex < index+len(line) && + result.StartIndex < result.EndIndex { + openActiveIndex := max(result.StartIndex-index, 0) + closeActiveIndex := min(result.EndIndex-index, len(line)) + highlightRanges = append(highlightRanges, [3]int{i, openActiveIndex, closeActiveIndex}) + err = writeStrings(&formattedLinesBuffer, + line[:openActiveIndex], + line[openActiveIndex:closeActiveIndex], + line[closeActiveIndex:], + ) + } else { + err = writeStrings(&formattedLinesBuffer, line) + } + if err != nil { + return nil, err + } + + lineNums = append(lineNums, startLineNum+i) + index += len(line) + } + + return &Result{ + RepoID: result.RepoID, + Filename: result.Filename, + CommitID: result.CommitID, + UpdatedUnix: result.UpdatedUnix, + Language: result.Language, + Color: result.Color, + Lines: HighlightSearchResultCode(result.Filename, lineNums, highlightRanges, formattedLinesBuffer.String()), + }, nil +} + +// PerformSearch perform a search on a repository +// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2 +func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) { + if opts == nil || len(opts.Keyword) == 0 { + return 0, nil, nil, nil + } + + total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, opts) + if err != nil { + return 0, nil, nil, err + } + + displayResults := make([]*Result, len(results)) + + for i, result := range results { + startIndex, endIndex := indices(result.Content, result.StartIndex, result.EndIndex) + displayResults[i], err = searchResult(result, startIndex, endIndex) + if err != nil { + return 0, nil, nil, err + } + } + return int(total), displayResults, resultLanguages, nil +} |