diff options
Diffstat (limited to 'services/gitdiff/highlightdiff.go')
-rw-r--r-- | services/gitdiff/highlightdiff.go | 227 |
1 files changed, 227 insertions, 0 deletions
diff --git a/services/gitdiff/highlightdiff.go b/services/gitdiff/highlightdiff.go new file mode 100644 index 0000000..c72959e --- /dev/null +++ b/services/gitdiff/highlightdiff.go @@ -0,0 +1,227 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package gitdiff + +import ( + "strings" + + "code.gitea.io/gitea/modules/highlight" + + "github.com/sergi/go-diff/diffmatchpatch" +) + +// token is a html tag or entity, eg: "<span ...>", "</span>", "<" +func extractHTMLToken(s string) (before, token, after string, valid bool) { + for pos1 := 0; pos1 < len(s); pos1++ { + if s[pos1] == '<' { + pos2 := strings.IndexByte(s[pos1:], '>') + if pos2 == -1 { + return "", "", s, false + } + return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true + } else if s[pos1] == '&' { + pos2 := strings.IndexByte(s[pos1:], ';') + if pos2 == -1 { + return "", "", s, false + } + return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true + } + } + return "", "", s, true +} + +// HighlightCodeDiff is used to do diff with highlighted HTML code. +// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes. +// The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001" +// These Unicode placeholders are friendly to the diff. +// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities. +// It's guaranteed that the tags in final diff result are paired correctly. +type HighlightCodeDiff struct { + placeholderBegin rune + placeholderMaxCount int + placeholderIndex int + PlaceholderTokenMap map[rune]string + tokenPlaceholderMap map[string]rune + + placeholderOverflowCount int + + lineWrapperTags []string +} + +func NewHighlightCodeDiff() *HighlightCodeDiff { + return &HighlightCodeDiff{ + placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD) + placeholderMaxCount: 64000, + PlaceholderTokenMap: map[rune]string{}, + tokenPlaceholderMap: map[string]rune{}, + } +} + +// NextPlaceholder returns 0 if no more placeholder can be used +// the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line +// so the placeholderMaxCount is impossible to be exhausted in real cases. +func (hcd *HighlightCodeDiff) NextPlaceholder() rune { + for hcd.placeholderIndex < hcd.placeholderMaxCount { + r := hcd.placeholderBegin + rune(hcd.placeholderIndex) + hcd.placeholderIndex++ + // only use non-existing (not used by code) rune as placeholders + if _, ok := hcd.PlaceholderTokenMap[r]; !ok { + return r + } + } + return 0 // no more available placeholder +} + +func (hcd *HighlightCodeDiff) isInPlaceholderRange(r rune) bool { + return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount) +} + +func (hcd *HighlightCodeDiff) CollectUsedRunes(code string) { + for _, r := range code { + if hcd.isInPlaceholderRange(r) { + // put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore. + hcd.PlaceholderTokenMap[r] = "" + } + } +} + +func (hcd *HighlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff { + hcd.CollectUsedRunes(codeA) + hcd.CollectUsedRunes(codeB) + + highlightCodeA, _ := highlight.Code(filename, language, codeA) + highlightCodeB, _ := highlight.Code(filename, language, codeB) + + convertedCodeA := hcd.ConvertToPlaceholders(string(highlightCodeA)) + convertedCodeB := hcd.ConvertToPlaceholders(string(highlightCodeB)) + + diffs := diffMatchPatch.DiffMain(convertedCodeA, convertedCodeB, true) + diffs = diffMatchPatch.DiffCleanupSemantic(diffs) + diffs = diffMatchPatch.DiffCleanupEfficiency(diffs) + + for i := range diffs { + hcd.recoverOneDiff(&diffs[i]) + } + return diffs +} + +// convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes. +func (hcd *HighlightCodeDiff) ConvertToPlaceholders(htmlCode string) string { + var tagStack []string + res := strings.Builder{} + + firstRunForLineTags := hcd.lineWrapperTags == nil + + var beforeToken, token string + var valid bool + + // the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>" + for { + beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode) + if !valid || token == "" { + break + } + // write the content before the token into result string, and consume the token in the string + res.WriteString(beforeToken) + + // the line wrapper tags should be removed before diff + if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) { + if firstRunForLineTags { + // if this is the first run for converting, save the line wrapper tags for later use, they should be added back + hcd.lineWrapperTags = append(hcd.lineWrapperTags, token) + } + htmlCode = strings.TrimSuffix(htmlCode, "</span>") + continue + } + + var tokenInMap string + if strings.HasSuffix(token, "</") { // for closing tag + if len(tagStack) == 0 { + break // invalid diff result, no opening tag but see closing tag + } + // make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags + // the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>" + tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->" + tagStack = tagStack[:len(tagStack)-1] + } else if token[0] == '<' { // for opening tag + tokenInMap = token + tagStack = append(tagStack, token) + } else if token[0] == '&' { // for html entity + tokenInMap = token + } // else: impossible + + // remember the placeholder and token in the map + placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap] + if !ok { + placeholder = hcd.NextPlaceholder() + if placeholder != 0 { + hcd.tokenPlaceholderMap[tokenInMap] = placeholder + hcd.PlaceholderTokenMap[placeholder] = tokenInMap + } + } + + if placeholder != 0 { + res.WriteRune(placeholder) // use the placeholder to replace the token + } else { + // unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting + // usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma. + hcd.placeholderOverflowCount++ + if strings.HasPrefix(token, "&") { + // when the token is a html entity, something must be outputted even if there is no placeholder. + res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully? + res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result. + } + } + } + + // write the remaining string + res.WriteString(htmlCode) + return res.String() +} + +func (hcd *HighlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) { + diff.Text = hcd.Recover(diff.Text) +} + +func (hcd *HighlightCodeDiff) Recover(src string) string { + sb := strings.Builder{} + var tagStack []string + + for _, r := range src { + token, ok := hcd.PlaceholderTokenMap[r] + if !ok || token == "" { + sb.WriteRune(r) // if the rune is not a placeholder, write it as it is + continue + } + var tokenToRecover string + if strings.HasPrefix(token, "</") { // for closing tag + // only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function) + tokenToRecover = token[:strings.IndexByte(token, '>')+1] + if len(tagStack) == 0 { + continue // if no opening tag in stack yet, skip the closing tag + } + tagStack = tagStack[:len(tagStack)-1] + } else if token[0] == '<' { // for opening tag + tokenToRecover = token + tagStack = append(tagStack, token) + } else if token[0] == '&' { // for html entity + tokenToRecover = token + } // else: impossible + sb.WriteString(tokenToRecover) + } + + if len(tagStack) > 0 { + // close all opening tags + for i := len(tagStack) - 1; i >= 0; i-- { + tagToClose := tagStack[i] + // get the closing tag "</span>" from "<span class=...>" or "<span>" + pos := strings.IndexAny(tagToClose, " >") + if pos != -1 { + sb.WriteString("</" + tagToClose[1:pos] + ">") + } // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag + } + } + + return sb.String() +} |