1 files changed, 227 insertions, 0 deletions
diff --git a/services/gitdiff/highlightdiff.go b/services/gitdiff/highlightdiff.go
new file mode 100644
index 0000000..c72959e
--- /dev/null
+++ b/services/gitdiff/highlightdiff.go
@@ -0,0 +1,227 @@
+// Copyright 2022 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package gitdiff
+
+import (
+	"strings"
+
+	"code.gitea.io/gitea/modules/highlight"
+
+	"github.com/sergi/go-diff/diffmatchpatch"
+)
+
+// extractHTMLToken splits s around the first HTML token it contains.
+// A token is an HTML tag or entity, eg: "<span ...>", "</span>", "&lt;".
+//
+// Results:
+//   - a token was found: before is the text preceding it, after is the text following it, valid is true.
+//   - s contains no '<' and no '&': before and token are empty, after is s, valid is true.
+//   - a token starts but is never terminated ('>' for a tag, ';' for an entity):
+//     before and token are empty, after is s, valid is false.
+func extractHTMLToken(s string) (before, token, after string, valid bool) {
+	start := strings.IndexAny(s, "<&")
+	if start == -1 {
+		return "", "", s, true
+	}
+
+	// a tag ends at '>', an entity ends at ';'
+	terminator := byte('>')
+	if s[start] == '&' {
+		terminator = ';'
+	}
+
+	rel := strings.IndexByte(s[start:], terminator)
+	if rel == -1 {
+		return "", "", s, false
+	}
+
+	end := start + rel + 1
+	return s[:start], s[start:end], s[end:], true
+}
+
+// HighlightCodeDiff is used to do diff with highlighted HTML code.
+// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
+// The HTML tags and entities will be replaced by Unicode placeholders taken from a private use area,
+// for example: "<span>{TEXT}</span>" => "\U00100000{TEXT}\U00100001"
+// These Unicode placeholders are friendly to the diff.
+// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
+// It's guaranteed that the tags in final diff result are paired correctly.
+type HighlightCodeDiff struct {
+	// placeholderBegin is the first rune of the placeholder range.
+	placeholderBegin    rune
+	// placeholderMaxCount is the number of runes in the placeholder range.
+	placeholderMaxCount int
+	// placeholderIndex is the offset (from placeholderBegin) of the next candidate placeholder.
+	placeholderIndex    int
+	// PlaceholderTokenMap maps a placeholder rune to the token it stands for.
+	// A rune that already occurs in the code is stored with an empty token, so it is never handed out as a placeholder.
+	PlaceholderTokenMap map[rune]string
+	// tokenPlaceholderMap is the reverse lookup (token => placeholder), so the same token always reuses the same placeholder.
+	tokenPlaceholderMap map[string]rune
+
+	// placeholderOverflowCount counts the tokens which could not get a placeholder because none was available.
+	placeholderOverflowCount int
+
+	// lineWrapperTags holds the line wrapper tags which were stripped from the highlighted HTML during converting.
+	lineWrapperTags []string
+}
+
+// NewHighlightCodeDiff creates a HighlightCodeDiff with empty token maps,
+// whose placeholders are the 64000 runes starting at U+100000.
+func NewHighlightCodeDiff() *HighlightCodeDiff {
+	hcd := new(HighlightCodeDiff)
+	// Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
+	hcd.placeholderBegin = rune(0x100000)
+	hcd.placeholderMaxCount = 64000
+	hcd.PlaceholderTokenMap = make(map[rune]string)
+	hcd.tokenPlaceholderMap = make(map[string]rune)
+	return hcd
+}
+
+// NextPlaceholder hands out the next unused placeholder rune, or 0 if no more placeholder can be used.
+// Runes which already have an entry in PlaceholderTokenMap (including the runes used by the code itself) are skipped.
+// The diff is done line by line, usually there are only a few (no more than 10) placeholders in one line,
+// so the placeholderMaxCount is impossible to be exhausted in real cases.
+func (hcd *HighlightCodeDiff) NextPlaceholder() rune {
+	for {
+		if hcd.placeholderIndex >= hcd.placeholderMaxCount {
+			return 0 // the whole placeholder range has been consumed
+		}
+		candidate := hcd.placeholderBegin + rune(hcd.placeholderIndex)
+		hcd.placeholderIndex++
+		if _, taken := hcd.PlaceholderTokenMap[candidate]; taken {
+			continue // already used (by the code or by another token), try the next one
+		}
+		return candidate
+	}
+}
+
+// isInPlaceholderRange reports whether r lies in the half-open range
+// [placeholderBegin, placeholderBegin+placeholderMaxCount).
+func (hcd *HighlightCodeDiff) isInPlaceholderRange(r rune) bool {
+	if r < hcd.placeholderBegin {
+		return false
+	}
+	rangeEnd := hcd.placeholderBegin + rune(hcd.placeholderMaxCount)
+	return r < rangeEnd
+}
+
+// CollectUsedRunes scans code and reserves every rune which falls in the placeholder range,
+// so NextPlaceholder never hands out a rune that the code itself already contains.
+func (hcd *HighlightCodeDiff) CollectUsedRunes(code string) {
+	for _, used := range code {
+		if !hcd.isInPlaceholderRange(used) {
+			continue
+		}
+		// an empty token marks the rune as "used by code": it is taken, but it is not a placeholder for anything
+		hcd.PlaceholderTokenMap[used] = ""
+	}
+}
+
+// diffWithHighlight highlights codeA and codeB, converts the HTML tokens of both results to placeholders,
+// diffs the converted strings, and finally recovers the placeholders in every diff item back to HTML.
+func (hcd *HighlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff {
+	// reserve the runes used by both inputs before any placeholder gets allocated
+	for _, code := range [...]string{codeA, codeB} {
+		hcd.CollectUsedRunes(code)
+	}
+
+	// NOTE(review): the second result of highlight.Code is discarded here, confirm that it is safe to ignore.
+	htmlA, _ := highlight.Code(filename, language, codeA)
+	htmlB, _ := highlight.Code(filename, language, codeB)
+
+	textA := hcd.ConvertToPlaceholders(string(htmlA))
+	textB := hcd.ConvertToPlaceholders(string(htmlB))
+
+	result := diffMatchPatch.DiffCleanupEfficiency(
+		diffMatchPatch.DiffCleanupSemantic(
+			diffMatchPatch.DiffMain(textA, textB, true),
+		),
+	)
+
+	for i := range result {
+		hcd.recoverOneDiff(&result[i])
+	}
+	return result
+}
+
+// ConvertToPlaceholders replaces every HTML tag and entity in htmlCode by a single placeholder rune
+// and returns the converted string. The same token always gets the same placeholder.
+// It totally depends on Chroma's valid HTML output and its structure, do not use this function for other purposes.
+//
+// The line wrapper tags (`<span class="line ...>` and `<span class="cl">`) are removed from the result,
+// one trailing "</span>" is trimmed for each of them. They are saved in hcd.lineWrapperTags
+// as long as no earlier call has saved any.
+//
+// A closing tag is recorded together with the opening tag it closes, so the closing tags of
+// different opening tags get different placeholders and the diff algorithm can match them.
+//
+// The converting stops at the first unterminated token or at a closing tag without an opening tag,
+// the remaining input is then appended to the result as it is.
+func (hcd *HighlightCodeDiff) ConvertToPlaceholders(htmlCode string) string {
+	var tagStack []string
+	res := strings.Builder{}
+
+	firstRunForLineTags := hcd.lineWrapperTags == nil
+
+	var beforeToken, token string
+	var valid bool
+
+	// the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>"
+	for {
+		beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode)
+		if !valid || token == "" {
+			break
+		}
+		// write the content before the token into result string, and consume the token in the string
+		res.WriteString(beforeToken)
+
+		// the line wrapper tags should be removed before diff
+		if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) {
+			if firstRunForLineTags {
+				// if this is the first run for converting, save the line wrapper tags for later use, they should be added back
+				hcd.lineWrapperTags = append(hcd.lineWrapperTags, token)
+			}
+			// the wrapper's closing tag is at the very end of the input, drop it together with the opening tag
+			htmlCode = strings.TrimSuffix(htmlCode, "</span>")
+			continue
+		}
+
+		var tokenInMap string
+		// a closing tag starts with "</" (it always ends with ">", so it must be detected by its prefix)
+		if strings.HasPrefix(token, "</") { // for closing tag
+			if len(tagStack) == 0 {
+				break // invalid diff result, no opening tag but see closing tag
+			}
+			// make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
+			// the closing tag will be recorded in the map by key "</span><!-- <span the-opening>-->" for "<span the-opening>"
+			tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->"
+			tagStack = tagStack[:len(tagStack)-1]
+		} else if token[0] == '<' { // for opening tag
+			tokenInMap = token
+			tagStack = append(tagStack, token)
+		} else if token[0] == '&' { // for html entity
+			tokenInMap = token
+		} // else: impossible
+
+		// remember the placeholder and token in the map
+		placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap]
+		if !ok {
+			placeholder = hcd.NextPlaceholder()
+			if placeholder != 0 {
+				hcd.tokenPlaceholderMap[tokenInMap] = placeholder
+				hcd.PlaceholderTokenMap[placeholder] = tokenInMap
+			}
+		}
+
+		if placeholder != 0 {
+			res.WriteRune(placeholder) // use the placeholder to replace the token
+		} else {
+			// unfortunately, all private use runes have been exhausted, no more placeholder could be used, no more converting
+			// usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
+			hcd.placeholderOverflowCount++
+			if strings.HasPrefix(token, "&") {
+				// when the token is a html entity, something must be outputted even if there is no placeholder.
+				res.WriteRune(0xFFFD)      // replacement character TODO: how to handle this case more gracefully?
+				res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
+			}
+		}
+	}
+
+	// write the remaining string
+	res.WriteString(htmlCode)
+	return res.String()
+}
+
+// recoverOneDiff replaces the text of diff, in place, by its recovered HTML (see Recover).
+func (hcd *HighlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) {
+	recovered := hcd.Recover(diff.Text)
+	diff.Text = recovered
+}
+
+// Recover converts the placeholder runes in src back to their HTML tags and entities.
+// Runes which are not placeholders are copied as they are.
+// The output tags are always paired: a closing tag without a preceding opening tag is dropped,
+// and the tags which are still open at the end of src get closed, innermost first.
+func (hcd *HighlightCodeDiff) Recover(src string) string {
+	var out strings.Builder
+	var openTags []string
+
+	for _, r := range src {
+		// a missing entry and an empty token (rune used by the code itself) both mean "not a placeholder"
+		token := hcd.PlaceholderTokenMap[r]
+		if token == "" {
+			out.WriteRune(r)
+			continue
+		}
+
+		switch {
+		case strings.HasPrefix(token, "</"): // closing tag
+			if len(openTags) == 0 {
+				continue // nothing is open, the closing tag has no partner in this piece, skip it
+			}
+			openTags = openTags[:len(openTags)-1]
+			// only output the tag itself, the map entry may carry a trailing comment after the first '>'
+			out.WriteString(token[:strings.IndexByte(token, '>')+1])
+		case token[0] == '<': // opening tag
+			openTags = append(openTags, token)
+			out.WriteString(token)
+		case token[0] == '&': // html entity
+			out.WriteString(token)
+		}
+	}
+
+	// close the tags which are still open, the most recently opened one first
+	for i := len(openTags) - 1; i >= 0; i-- {
+		tag := openTags[i]
+		// the tag name ends at the first space or '>': "<span class=...>" or "<span>" => "span"
+		if end := strings.IndexAny(tag, " >"); end != -1 {
+			out.WriteString("</")
+			out.WriteString(tag[1:end])
+			out.WriteString(">")
+		}
+	}
+
+	return out.String()
+}