diff options
Diffstat (limited to '')
-rw-r--r-- | modules/charset/ambiguous.go | 59 | ||||
-rw-r--r-- | modules/charset/ambiguous/ambiguous.json | 1 | ||||
-rw-r--r-- | modules/charset/ambiguous/generate.go | 188 | ||||
-rw-r--r-- | modules/charset/ambiguous_gen.go | 836 | ||||
-rw-r--r-- | modules/charset/ambiguous_gen_test.go | 31 | ||||
-rw-r--r-- | modules/charset/breakwriter.go | 43 | ||||
-rw-r--r-- | modules/charset/breakwriter_test.go | 68 | ||||
-rw-r--r-- | modules/charset/charset.go | 211 | ||||
-rw-r--r-- | modules/charset/charset_test.go | 385 | ||||
-rw-r--r-- | modules/charset/escape.go | 58 | ||||
-rw-r--r-- | modules/charset/escape_status.go | 27 | ||||
-rw-r--r-- | modules/charset/escape_stream.go | 289 | ||||
-rw-r--r-- | modules/charset/escape_test.go | 194 | ||||
-rw-r--r-- | modules/charset/htmlstream.go | 200 | ||||
-rw-r--r-- | modules/charset/invisible/generate.go | 121 | ||||
-rw-r--r-- | modules/charset/invisible_gen.go | 36 |
16 files changed, 2747 insertions, 0 deletions
diff --git a/modules/charset/ambiguous.go b/modules/charset/ambiguous.go new file mode 100644 index 0000000..96e0561 --- /dev/null +++ b/modules/charset/ambiguous.go @@ -0,0 +1,59 @@ +// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT +// Copyright 2022 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package charset + +import ( + "sort" + "strings" + "unicode" + + "code.gitea.io/gitea/modules/translation" +) + +// AmbiguousTablesForLocale provides the table of ambiguous characters for this locale. +func AmbiguousTablesForLocale(locale translation.Locale) []*AmbiguousTable { + key := locale.Language() + var table *AmbiguousTable + var ok bool + for len(key) > 0 { + if table, ok = AmbiguousCharacters[key]; ok { + break + } + idx := strings.LastIndexAny(key, "-_") + if idx < 0 { + key = "" + } else { + key = key[:idx] + } + } + if table == nil && (locale.Language() == "zh-CN" || locale.Language() == "zh_CN") { + table = AmbiguousCharacters["zh-hans"] + } + if table == nil && strings.HasPrefix(locale.Language(), "zh") { + table = AmbiguousCharacters["zh-hant"] + } + if table == nil { + table = AmbiguousCharacters["_default"] + } + + return []*AmbiguousTable{ + table, + AmbiguousCharacters["_common"], + } +} + +func isAmbiguous(r rune, confusableTo *rune, tables ...*AmbiguousTable) bool { + for _, table := range tables { + if !unicode.Is(table.RangeTable, r) { + continue + } + i := sort.Search(len(table.Confusable), func(i int) bool { + return table.Confusable[i] >= r + }) + (*confusableTo) = table.With[i] + return true + } + return false +} diff --git a/modules/charset/ambiguous/ambiguous.json b/modules/charset/ambiguous/ambiguous.json new file mode 100644 index 0000000..d0f69f6 --- /dev/null +++ b/modules/charset/ambiguous/ambiguous.json @@ -0,0 +1 @@ 
+"{\"_common\":[8232,32,8233,32,5760,32,8192,32,8193,32,8194,32,8195,32,8196,32,8197,32,8198,32,8200,32,8201,32,8202,32,8287,32,8199,32,8239,32,2042,95,65101,95,65102,95,65103,95,8208,45,8209,45,8210,45,65112,45,1748,45,8259,45,727,45,8722,45,10134,45,11450,45,1549,44,1643,44,8218,44,184,44,42233,44,894,59,2307,58,2691,58,1417,58,1795,58,1796,58,5868,58,65072,58,6147,58,6153,58,8282,58,1475,58,760,58,42889,58,8758,58,720,58,42237,58,451,33,11601,33,660,63,577,63,2429,63,5038,63,42731,63,119149,46,8228,46,1793,46,1794,46,42510,46,68176,46,1632,46,1776,46,42232,46,1373,96,65287,96,8219,96,8242,96,1370,96,1523,96,8175,96,65344,96,900,96,8189,96,8125,96,8127,96,8190,96,697,96,884,96,712,96,714,96,715,96,756,96,699,96,701,96,700,96,702,96,42892,96,1497,96,2036,96,2037,96,5194,96,5836,96,94033,96,94034,96,65339,91,10088,40,10098,40,12308,40,64830,40,65341,93,10089,41,10099,41,12309,41,64831,41,10100,123,119060,123,10101,125,65342,94,8270,42,1645,42,8727,42,66335,42,5941,47,8257,47,8725,47,8260,47,9585,47,10187,47,10744,47,119354,47,12755,47,12339,47,11462,47,20031,47,12035,47,65340,92,65128,92,8726,92,10189,92,10741,92,10745,92,119311,92,119355,92,12756,92,20022,92,12034,92,42872,38,708,94,710,94,5869,43,10133,43,66203,43,8249,60,10094,60,706,60,119350,60,5176,60,5810,60,5120,61,11840,61,12448,61,42239,61,8250,62,10095,62,707,62,119351,62,5171,62,94015,62,8275,126,732,126,8128,126,8764,126,65372,124,65293,45,120784,50,120794,50,120804,50,120814,50,120824,50,130034,50,42842,50,423,50,1000,50,42564,50,5311,50,42735,50,119302,51,120785,51,120795,51,120805,51,120815,51,120825,51,130035,51,42923,51,540,51,439,51,42858,51,11468,51,1248,51,94011,51,71882,51,120786,52,120796,52,120806,52,120816,52,120826,52,130036,52,5070,52,71855,52,120787,53,120797,53,120807,53,120817,53,120827,53,130037,53,444,53,71867,53,120788,54,120798,54,120808,54,120818,54,120828,54,130038,54,11474,54,5102,54,71893,54,119314,55,120789,55,120799,55,120809,55,120819,55,120829,55,130039,55,66770,55,71878,55,
2819,56,2538,56,2666,56,125131,56,120790,56,120800,56,120810,56,120820,56,120830,56,130040,56,547,56,546,56,66330,56,2663,57,2920,57,2541,57,3437,57,120791,57,120801,57,120811,57,120821,57,120831,57,130041,57,42862,57,11466,57,71884,57,71852,57,71894,57,9082,97,65345,97,119834,97,119886,97,119938,97,119990,97,120042,97,120094,97,120146,97,120198,97,120250,97,120302,97,120354,97,120406,97,120458,97,593,97,945,97,120514,97,120572,97,120630,97,120688,97,120746,97,65313,65,119808,65,119860,65,119912,65,119964,65,120016,65,120068,65,120120,65,120172,65,120224,65,120276,65,120328,65,120380,65,120432,65,913,65,120488,65,120546,65,120604,65,120662,65,120720,65,5034,65,5573,65,42222,65,94016,65,66208,65,119835,98,119887,98,119939,98,119991,98,120043,98,120095,98,120147,98,120199,98,120251,98,120303,98,120355,98,120407,98,120459,98,388,98,5071,98,5234,98,5551,98,65314,66,8492,66,119809,66,119861,66,119913,66,120017,66,120069,66,120121,66,120173,66,120225,66,120277,66,120329,66,120381,66,120433,66,42932,66,914,66,120489,66,120547,66,120605,66,120663,66,120721,66,5108,66,5623,66,42192,66,66178,66,66209,66,66305,66,65347,99,8573,99,119836,99,119888,99,119940,99,119992,99,120044,99,120096,99,120148,99,120200,99,120252,99,120304,99,120356,99,120408,99,120460,99,7428,99,1010,99,11429,99,43951,99,66621,99,128844,67,71922,67,71913,67,65315,67,8557,67,8450,67,8493,67,119810,67,119862,67,119914,67,119966,67,120018,67,120174,67,120226,67,120278,67,120330,67,120382,67,120434,67,1017,67,11428,67,5087,67,42202,67,66210,67,66306,67,66581,67,66844,67,8574,100,8518,100,119837,100,119889,100,119941,100,119993,100,120045,100,120097,100,120149,100,120201,100,120253,100,120305,100,120357,100,120409,100,120461,100,1281,100,5095,100,5231,100,42194,100,8558,68,8517,68,119811,68,119863,68,119915,68,119967,68,120019,68,120071,68,120123,68,120175,68,120227,68,120279,68,120331,68,120383,68,120435,68,5024,68,5598,68,5610,68,42195,68,8494,101,65349,101,8495,101,8519,101,119838,101,119890,101,119942,101,12
0046,101,120098,101,120150,101,120202,101,120254,101,120306,101,120358,101,120410,101,120462,101,43826,101,1213,101,8959,69,65317,69,8496,69,119812,69,119864,69,119916,69,120020,69,120072,69,120124,69,120176,69,120228,69,120280,69,120332,69,120384,69,120436,69,917,69,120492,69,120550,69,120608,69,120666,69,120724,69,11577,69,5036,69,42224,69,71846,69,71854,69,66182,69,119839,102,119891,102,119943,102,119995,102,120047,102,120099,102,120151,102,120203,102,120255,102,120307,102,120359,102,120411,102,120463,102,43829,102,42905,102,383,102,7837,102,1412,102,119315,70,8497,70,119813,70,119865,70,119917,70,120021,70,120073,70,120125,70,120177,70,120229,70,120281,70,120333,70,120385,70,120437,70,42904,70,988,70,120778,70,5556,70,42205,70,71874,70,71842,70,66183,70,66213,70,66853,70,65351,103,8458,103,119840,103,119892,103,119944,103,120048,103,120100,103,120152,103,120204,103,120256,103,120308,103,120360,103,120412,103,120464,103,609,103,7555,103,397,103,1409,103,119814,71,119866,71,119918,71,119970,71,120022,71,120074,71,120126,71,120178,71,120230,71,120282,71,120334,71,120386,71,120438,71,1292,71,5056,71,5107,71,42198,71,65352,104,8462,104,119841,104,119945,104,119997,104,120049,104,120101,104,120153,104,120205,104,120257,104,120309,104,120361,104,120413,104,120465,104,1211,104,1392,104,5058,104,65320,72,8459,72,8460,72,8461,72,119815,72,119867,72,119919,72,120023,72,120179,72,120231,72,120283,72,120335,72,120387,72,120439,72,919,72,120494,72,120552,72,120610,72,120668,72,120726,72,11406,72,5051,72,5500,72,42215,72,66255,72,731,105,9075,105,65353,105,8560,105,8505,105,8520,105,119842,105,119894,105,119946,105,119998,105,120050,105,120102,105,120154,105,120206,105,120258,105,120310,105,120362,105,120414,105,120466,105,120484,105,618,105,617,105,953,105,8126,105,890,105,120522,105,120580,105,120638,105,120696,105,120754,105,1110,105,42567,105,1231,105,43893,105,5029,105,71875,105,65354,106,8521,106,119843,106,119895,106,119947,106,119999,106,120051,106,120103,106,120155,10
6,120207,106,120259,106,120311,106,120363,106,120415,106,120467,106,1011,106,1112,106,65322,74,119817,74,119869,74,119921,74,119973,74,120025,74,120077,74,120129,74,120181,74,120233,74,120285,74,120337,74,120389,74,120441,74,42930,74,895,74,1032,74,5035,74,5261,74,42201,74,119844,107,119896,107,119948,107,120000,107,120052,107,120104,107,120156,107,120208,107,120260,107,120312,107,120364,107,120416,107,120468,107,8490,75,65323,75,119818,75,119870,75,119922,75,119974,75,120026,75,120078,75,120130,75,120182,75,120234,75,120286,75,120338,75,120390,75,120442,75,922,75,120497,75,120555,75,120613,75,120671,75,120729,75,11412,75,5094,75,5845,75,42199,75,66840,75,1472,108,8739,73,9213,73,65512,73,1633,108,1777,73,66336,108,125127,108,120783,73,120793,73,120803,73,120813,73,120823,73,130033,73,65321,73,8544,73,8464,73,8465,73,119816,73,119868,73,119920,73,120024,73,120128,73,120180,73,120232,73,120284,73,120336,73,120388,73,120440,73,65356,108,8572,73,8467,108,119845,108,119897,108,119949,108,120001,108,120053,108,120105,73,120157,73,120209,73,120261,73,120313,73,120365,73,120417,73,120469,73,448,73,120496,73,120554,73,120612,73,120670,73,120728,73,11410,73,1030,73,1216,73,1493,108,1503,108,1575,108,126464,108,126592,108,65166,108,65165,108,1994,108,11599,73,5825,73,42226,73,93992,73,66186,124,66313,124,119338,76,8556,76,8466,76,119819,76,119871,76,119923,76,120027,76,120079,76,120131,76,120183,76,120235,76,120287,76,120339,76,120391,76,120443,76,11472,76,5086,76,5290,76,42209,76,93974,76,71843,76,71858,76,66587,76,66854,76,65325,77,8559,77,8499,77,119820,77,119872,77,119924,77,120028,77,120080,77,120132,77,120184,77,120236,77,120288,77,120340,77,120392,77,120444,77,924,77,120499,77,120557,77,120615,77,120673,77,120731,77,1018,77,11416,77,5047,77,5616,77,5846,77,42207,77,66224,77,66321,77,119847,110,119899,110,119951,110,120003,110,120055,110,120107,110,120159,110,120211,110,120263,110,120315,110,120367,110,120419,110,120471,110,1400,110,1404,110,65326,78,8469,78,119821,78,1
19873,78,119925,78,119977,78,120029,78,120081,78,120185,78,120237,78,120289,78,120341,78,120393,78,120445,78,925,78,120500,78,120558,78,120616,78,120674,78,120732,78,11418,78,42208,78,66835,78,3074,111,3202,111,3330,111,3458,111,2406,111,2662,111,2790,111,3046,111,3174,111,3302,111,3430,111,3664,111,3792,111,4160,111,1637,111,1781,111,65359,111,8500,111,119848,111,119900,111,119952,111,120056,111,120108,111,120160,111,120212,111,120264,111,120316,111,120368,111,120420,111,120472,111,7439,111,7441,111,43837,111,959,111,120528,111,120586,111,120644,111,120702,111,120760,111,963,111,120532,111,120590,111,120648,111,120706,111,120764,111,11423,111,4351,111,1413,111,1505,111,1607,111,126500,111,126564,111,126596,111,65259,111,65260,111,65258,111,65257,111,1726,111,64428,111,64429,111,64427,111,64426,111,1729,111,64424,111,64425,111,64423,111,64422,111,1749,111,3360,111,4125,111,66794,111,71880,111,71895,111,66604,111,1984,79,2534,79,2918,79,12295,79,70864,79,71904,79,120782,79,120792,79,120802,79,120812,79,120822,79,130032,79,65327,79,119822,79,119874,79,119926,79,119978,79,120030,79,120082,79,120134,79,120186,79,120238,79,120290,79,120342,79,120394,79,120446,79,927,79,120502,79,120560,79,120618,79,120676,79,120734,79,11422,79,1365,79,11604,79,4816,79,2848,79,66754,79,42227,79,71861,79,66194,79,66219,79,66564,79,66838,79,9076,112,65360,112,119849,112,119901,112,119953,112,120005,112,120057,112,120109,112,120161,112,120213,112,120265,112,120317,112,120369,112,120421,112,120473,112,961,112,120530,112,120544,112,120588,112,120602,112,120646,112,120660,112,120704,112,120718,112,120762,112,120776,112,11427,112,65328,80,8473,80,119823,80,119875,80,119927,80,119979,80,120031,80,120083,80,120187,80,120239,80,120291,80,120343,80,120395,80,120447,80,929,80,120504,80,120562,80,120620,80,120678,80,120736,80,11426,80,5090,80,5229,80,42193,80,66197,80,119850,113,119902,113,119954,113,120006,113,120058,113,120110,113,120162,113,120214,113,120266,113,120318,113,120370,113,120422,113,120
474,113,1307,113,1379,113,1382,113,8474,81,119824,81,119876,81,119928,81,119980,81,120032,81,120084,81,120188,81,120240,81,120292,81,120344,81,120396,81,120448,81,11605,81,119851,114,119903,114,119955,114,120007,114,120059,114,120111,114,120163,114,120215,114,120267,114,120319,114,120371,114,120423,114,120475,114,43847,114,43848,114,7462,114,11397,114,43905,114,119318,82,8475,82,8476,82,8477,82,119825,82,119877,82,119929,82,120033,82,120189,82,120241,82,120293,82,120345,82,120397,82,120449,82,422,82,5025,82,5074,82,66740,82,5511,82,42211,82,94005,82,65363,115,119852,115,119904,115,119956,115,120008,115,120060,115,120112,115,120164,115,120216,115,120268,115,120320,115,120372,115,120424,115,120476,115,42801,115,445,115,1109,115,43946,115,71873,115,66632,115,65331,83,119826,83,119878,83,119930,83,119982,83,120034,83,120086,83,120138,83,120190,83,120242,83,120294,83,120346,83,120398,83,120450,83,1029,83,1359,83,5077,83,5082,83,42210,83,94010,83,66198,83,66592,83,119853,116,119905,116,119957,116,120009,116,120061,116,120113,116,120165,116,120217,116,120269,116,120321,116,120373,116,120425,116,120477,116,8868,84,10201,84,128872,84,65332,84,119827,84,119879,84,119931,84,119983,84,120035,84,120087,84,120139,84,120191,84,120243,84,120295,84,120347,84,120399,84,120451,84,932,84,120507,84,120565,84,120623,84,120681,84,120739,84,11430,84,5026,84,42196,84,93962,84,71868,84,66199,84,66225,84,66325,84,119854,117,119906,117,119958,117,120010,117,120062,117,120114,117,120166,117,120218,117,120270,117,120322,117,120374,117,120426,117,120478,117,42911,117,7452,117,43854,117,43858,117,651,117,965,117,120534,117,120592,117,120650,117,120708,117,120766,117,1405,117,66806,117,71896,117,8746,85,8899,85,119828,85,119880,85,119932,85,119984,85,120036,85,120088,85,120140,85,120192,85,120244,85,120296,85,120348,85,120400,85,120452,85,1357,85,4608,85,66766,85,5196,85,42228,85,94018,85,71864,85,8744,118,8897,118,65366,118,8564,118,119855,118,119907,118,119959,118,120011,118,120063,118,120115,118
,120167,118,120219,118,120271,118,120323,118,120375,118,120427,118,120479,118,7456,118,957,118,120526,118,120584,118,120642,118,120700,118,120758,118,1141,118,1496,118,71430,118,43945,118,71872,118,119309,86,1639,86,1783,86,8548,86,119829,86,119881,86,119933,86,119985,86,120037,86,120089,86,120141,86,120193,86,120245,86,120297,86,120349,86,120401,86,120453,86,1140,86,11576,86,5081,86,5167,86,42719,86,42214,86,93960,86,71840,86,66845,86,623,119,119856,119,119908,119,119960,119,120012,119,120064,119,120116,119,120168,119,120220,119,120272,119,120324,119,120376,119,120428,119,120480,119,7457,119,1121,119,1309,119,1377,119,71434,119,71438,119,71439,119,43907,119,71919,87,71910,87,119830,87,119882,87,119934,87,119986,87,120038,87,120090,87,120142,87,120194,87,120246,87,120298,87,120350,87,120402,87,120454,87,1308,87,5043,87,5076,87,42218,87,5742,120,10539,120,10540,120,10799,120,65368,120,8569,120,119857,120,119909,120,119961,120,120013,120,120065,120,120117,120,120169,120,120221,120,120273,120,120325,120,120377,120,120429,120,120481,120,5441,120,5501,120,5741,88,9587,88,66338,88,71916,88,65336,88,8553,88,119831,88,119883,88,119935,88,119987,88,120039,88,120091,88,120143,88,120195,88,120247,88,120299,88,120351,88,120403,88,120455,88,42931,88,935,88,120510,88,120568,88,120626,88,120684,88,120742,88,11436,88,11613,88,5815,88,42219,88,66192,88,66228,88,66327,88,66855,88,611,121,7564,121,65369,121,119858,121,119910,121,119962,121,120014,121,120066,121,120118,121,120170,121,120222,121,120274,121,120326,121,120378,121,120430,121,120482,121,655,121,7935,121,43866,121,947,121,8509,121,120516,121,120574,121,120632,121,120690,121,120748,121,1199,121,4327,121,71900,121,65337,89,119832,89,119884,89,119936,89,119988,89,120040,89,120092,89,120144,89,120196,89,120248,89,120300,89,120352,89,120404,89,120456,89,933,89,978,89,120508,89,120566,89,120624,89,120682,89,120740,89,11432,89,1198,89,5033,89,5053,89,42220,89,94019,89,71844,89,66226,89,119859,122,119911,122,119963,122,120015,122,12
0067,122,120119,122,120171,122,120223,122,120275,122,120327,122,120379,122,120431,122,120483,122,7458,122,43923,122,71876,122,66293,90,71909,90,65338,90,8484,90,8488,90,119833,90,119885,90,119937,90,119989,90,120041,90,120197,90,120249,90,120301,90,120353,90,120405,90,120457,90,918,90,120493,90,120551,90,120609,90,120667,90,120725,90,5059,90,42204,90,71849,90,65282,34,65284,36,65285,37,65286,38,65290,42,65291,43,65294,46,65295,47,65296,48,65297,49,65298,50,65299,51,65300,52,65301,53,65302,54,65303,55,65304,56,65305,57,65308,60,65309,61,65310,62,65312,64,65316,68,65318,70,65319,71,65324,76,65329,81,65330,82,65333,85,65334,86,65335,87,65343,95,65346,98,65348,100,65350,102,65355,107,65357,109,65358,110,65361,113,65362,114,65364,116,65365,117,65367,119,65370,122,65371,123,65373,125],\"_default\":[160,32,8211,45,65374,126,65306,58,65281,33,8216,96,8217,96,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"cs\":[65374,126,65306,58,65281,33,8216,96,8217,96,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"de\":[65374,126,65306,58,65281,33,8216,96,8217,96,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"es\":[8211,45,65374,126,65306,58,65281,33,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,
69,1053,72,305,105,1050,75,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"fr\":[65374,126,65306,58,65281,33,8216,96,8245,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"it\":[160,32,8211,45,65374,126,65306,58,65281,33,8216,96,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"ja\":[8211,45,65306,58,65281,33,8216,96,8217,96,8245,96,180,96,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65292,44,65307,59],\"ko\":[8211,45,65374,126,65306,58,65281,33,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"pl\":[65374,126,65306,58,65281,33,8216,96,8217,96,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"pt-BR\":[65374,126,65306,58,65281,33,8216,96,8217,96,8245,96,180,96,12494,47,1047,51,1073,54,10
72,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"qps-ploc\":[160,32,8211,45,65374,126,65306,58,65281,33,8216,96,8217,96,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"ru\":[65374,126,65306,58,65281,33,8216,96,8217,96,8245,96,180,96,12494,47,305,105,921,73,1009,112,215,120,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"tr\":[160,32,8211,45,65374,126,65306,58,65281,33,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65288,40,65289,41,65292,44,65307,59,65311,63],\"zh-hans\":[65374,126,65306,58,65281,33,8245,96,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65288,40,65289,41],\"zh-hant\":[8211,45,65374,126,180,96,12494,47,1047,51,1073,54,1072,97,1040,65,1068,98,1042,66,1089,99,1057,67,1077,101,1045,69,1053,72,305,105,1050,75,921,73,1052,77,1086,111,1054,79,1009,112,1088,112,1056,80,1075,114,1058,84,215,120,1093,120,1061,88,1091,121,1059,89,65283,35,65307,59]}"
\ No newline at end of file diff --git a/modules/charset/ambiguous/generate.go b/modules/charset/ambiguous/generate.go new file mode 100644 index 0000000..e3fda5b --- /dev/null +++ b/modules/charset/ambiguous/generate.go @@ -0,0 +1,188 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package main + +import ( + "bytes" + "flag" + "fmt" + "go/format" + "os" + "sort" + "text/template" + "unicode" + + "code.gitea.io/gitea/modules/json" + + "golang.org/x/text/unicode/rangetable" +) + +// ambiguous.json provides a one to one mapping of ambiguous characters to other characters +// See https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json + +type AmbiguousTable struct { + Confusable []rune + With []rune + Locale string + RangeTable *unicode.RangeTable +} + +type RunePair struct { + Confusable rune + With rune +} + +var verbose bool + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, `%s: Generate AmbiguousCharacter + +Usage: %[1]s [-v] [-o output.go] ambiguous.json +`, os.Args[0]) + flag.PrintDefaults() + } + + output := "" + flag.BoolVar(&verbose, "v", false, "verbose output") + flag.StringVar(&output, "o", "ambiguous_gen.go", "file to output to") + flag.Parse() + input := flag.Arg(0) + if input == "" { + input = "ambiguous.json" + } + + bs, err := os.ReadFile(input) + if err != nil { + fatalf("Unable to read: %s Err: %v", input, err) + } + + var unwrapped string + if err := json.Unmarshal(bs, &unwrapped); err != nil { + fatalf("Unable to unwrap content in: %s Err: %v", input, err) + } + + fromJSON := map[string][]uint32{} + if err := json.Unmarshal([]byte(unwrapped), &fromJSON); err != nil { + fatalf("Unable to unmarshal content in: %s Err: %v", input, err) + } + + tables := make([]*AmbiguousTable, 0, len(fromJSON)) + for locale, chars := range fromJSON { + table := &AmbiguousTable{Locale: locale} + table.Confusable = make([]rune, 0, len(chars)/2) + table.With = make([]rune, 0, 
len(chars)/2) + pairs := make([]RunePair, len(chars)/2) + for i := 0; i < len(chars); i += 2 { + pairs[i/2].Confusable, pairs[i/2].With = rune(chars[i]), rune(chars[i+1]) + } + sort.Slice(pairs, func(i, j int) bool { + return pairs[i].Confusable < pairs[j].Confusable + }) + for _, pair := range pairs { + table.Confusable = append(table.Confusable, pair.Confusable) + table.With = append(table.With, pair.With) + } + table.RangeTable = rangetable.New(table.Confusable...) + tables = append(tables, table) + } + sort.Slice(tables, func(i, j int) bool { + return tables[i].Locale < tables[j].Locale + }) + data := map[string]any{ + "Tables": tables, + } + + if err := runTemplate(generatorTemplate, output, &data); err != nil { + fatalf("Unable to run template: %v", err) + } +} + +func runTemplate(t *template.Template, filename string, data any) error { + buf := bytes.NewBuffer(nil) + if err := t.Execute(buf, data); err != nil { + return fmt.Errorf("unable to execute template: %w", err) + } + bs, err := format.Source(buf.Bytes()) + if err != nil { + verbosef("Bad source:\n%s", buf.String()) + return fmt.Errorf("unable to format source: %w", err) + } + + old, err := os.ReadFile(filename) + if err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to read old file %s because %w", filename, err) + } else if err == nil { + if bytes.Equal(bs, old) { + // files are the same don't rewrite it. + return nil + } + } + + file, err := os.Create(filename) + if err != nil { + return fmt.Errorf("failed to create file %s because %w", filename, err) + } + defer file.Close() + _, err = file.Write(bs) + if err != nil { + return fmt.Errorf("unable to write generated source: %w", err) + } + return nil +} + +var generatorTemplate = template.Must(template.New("ambiguousTemplate").Parse(`// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT +// Copyright 2022 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + + +package charset + +import "unicode" + +// This file is generated from https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json + +// AmbiguousTable matches a confusable rune with its partner for the Locale +type AmbiguousTable struct { + Confusable []rune + With []rune + Locale string + RangeTable *unicode.RangeTable +} + +// AmbiguousCharacters provides a map by locale name to the confusable characters in that locale +var AmbiguousCharacters = map[string]*AmbiguousTable{ + {{range .Tables}}{{printf "%q:" .Locale}} { + Confusable: []rune{ {{range .Confusable}}{{.}},{{end}} }, + With: []rune{ {{range .With}}{{.}},{{end}} }, + Locale: {{printf "%q" .Locale}}, + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {{range .RangeTable.R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}}, + {{end}} }, + R32: []unicode.Range32{ + {{range .RangeTable.R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}}, + {{end}} }, + LatinOffset: {{.RangeTable.LatinOffset}}, + }, + }, + {{end}} +} + +`)) + +func logf(format string, args ...any) { + fmt.Fprintf(os.Stderr, format+"\n", args...) +} + +func verbosef(format string, args ...any) { + if verbose { + logf(format, args...) + } +} + +func fatalf(format string, args ...any) { + logf("fatal: "+format+"\n", args...) + os.Exit(1) +} diff --git a/modules/charset/ambiguous_gen.go b/modules/charset/ambiguous_gen.go new file mode 100644 index 0000000..c88ffd5 --- /dev/null +++ b/modules/charset/ambiguous_gen.go @@ -0,0 +1,836 @@ +// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT +// Copyright 2022 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package charset + +import "unicode" + +// This file is generated from https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json + +// AmbiguousTable matches a confusable rune with its partner for the Locale +type AmbiguousTable struct { + Confusable []rune + With []rune + Locale string + RangeTable *unicode.RangeTable +} + +// AmbiguousCharacters provides a map by locale name to the confusable characters in that locale +var AmbiguousCharacters = map[string]*AmbiguousTable{ + "_common": { + Confusable: []rune{184, 383, 388, 397, 422, 423, 439, 444, 445, 448, 451, 540, 546, 547, 577, 593, 609, 611, 617, 618, 623, 651, 655, 660, 697, 699, 700, 701, 702, 706, 707, 708, 710, 712, 714, 715, 720, 727, 731, 732, 756, 760, 884, 890, 894, 895, 900, 913, 914, 917, 918, 919, 922, 924, 925, 927, 929, 932, 933, 935, 945, 947, 953, 957, 959, 961, 963, 965, 978, 988, 1000, 1010, 1011, 1017, 1018, 1029, 1030, 1032, 1109, 1110, 1112, 1121, 1140, 1141, 1198, 1199, 1211, 1213, 1216, 1231, 1248, 1281, 1292, 1307, 1308, 1309, 1357, 1359, 1365, 1370, 1373, 1377, 1379, 1382, 1392, 1400, 1404, 1405, 1409, 1412, 1413, 1417, 1472, 1475, 1493, 1496, 1497, 1503, 1505, 1523, 1549, 1575, 1607, 1632, 1633, 1637, 1639, 1643, 1645, 1726, 1729, 1748, 1749, 1776, 1777, 1781, 1783, 1793, 1794, 1795, 1796, 1984, 1994, 2036, 2037, 2042, 2307, 2406, 2429, 2534, 2538, 2541, 2662, 2663, 2666, 2691, 2790, 2819, 2848, 2918, 2920, 3046, 3074, 3174, 3202, 3302, 3330, 3360, 3430, 3437, 3458, 3664, 3792, 4125, 4160, 4327, 4351, 4608, 4816, 5024, 5025, 5026, 5029, 5033, 5034, 5035, 5036, 5038, 5043, 5047, 5051, 5053, 5056, 5058, 5059, 5070, 5071, 5074, 5076, 5077, 5081, 5082, 5086, 5087, 5090, 5094, 5095, 5102, 5107, 5108, 5120, 5167, 5171, 5176, 5194, 5196, 5229, 5231, 5234, 5261, 5290, 5311, 5441, 5500, 5501, 5511, 5551, 5556, 5573, 5598, 5610, 5616, 5623, 5741, 5742, 5760, 5810, 5815, 5825, 5836, 5845, 5846, 5868, 5869, 5941, 6147, 6153, 7428, 7439, 7441, 
7452, 7456, 7457, 7458, 7462, 7555, 7564, 7837, 7935, 8125, 8126, 8127, 8128, 8175, 8189, 8190, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8208, 8209, 8210, 8218, 8219, 8228, 8232, 8233, 8239, 8242, 8249, 8250, 8257, 8259, 8260, 8270, 8275, 8282, 8287, 8450, 8458, 8459, 8460, 8461, 8462, 8464, 8465, 8466, 8467, 8469, 8473, 8474, 8475, 8476, 8477, 8484, 8488, 8490, 8492, 8493, 8494, 8495, 8496, 8497, 8499, 8500, 8505, 8509, 8517, 8518, 8519, 8520, 8521, 8544, 8548, 8553, 8556, 8557, 8558, 8559, 8560, 8564, 8569, 8572, 8573, 8574, 8722, 8725, 8726, 8727, 8739, 8744, 8746, 8758, 8764, 8868, 8897, 8899, 8959, 9075, 9076, 9082, 9213, 9585, 9587, 10088, 10089, 10094, 10095, 10098, 10099, 10100, 10101, 10133, 10134, 10187, 10189, 10201, 10539, 10540, 10741, 10744, 10745, 10799, 11397, 11406, 11410, 11412, 11416, 11418, 11422, 11423, 11426, 11427, 11428, 11429, 11430, 11432, 11436, 11450, 11462, 11466, 11468, 11472, 11474, 11576, 11577, 11599, 11601, 11604, 11605, 11613, 11840, 12034, 12035, 12295, 12308, 12309, 12339, 12448, 12755, 12756, 20022, 20031, 42192, 42193, 42194, 42195, 42196, 42198, 42199, 42201, 42202, 42204, 42205, 42207, 42208, 42209, 42210, 42211, 42214, 42215, 42218, 42219, 42220, 42222, 42224, 42226, 42227, 42228, 42232, 42233, 42237, 42239, 42510, 42564, 42567, 42719, 42731, 42735, 42801, 42842, 42858, 42862, 42872, 42889, 42892, 42904, 42905, 42911, 42923, 42930, 42931, 42932, 43826, 43829, 43837, 43847, 43848, 43854, 43858, 43866, 43893, 43905, 43907, 43923, 43945, 43946, 43951, 64422, 64423, 64424, 64425, 64426, 64427, 64428, 64429, 64830, 64831, 65072, 65101, 65102, 65103, 65112, 65128, 65165, 65166, 65257, 65258, 65259, 65260, 65282, 65284, 65285, 65286, 65287, 65290, 65291, 65293, 65294, 65295, 65296, 65297, 65298, 65299, 65300, 65301, 65302, 65303, 65304, 65305, 65308, 65309, 65310, 65312, 65313, 65314, 65315, 65316, 65317, 65318, 65319, 65320, 65321, 65322, 65323, 65324, 65325, 65326, 65327, 65328, 65329, 65330, 65331, 
65332, 65333, 65334, 65335, 65336, 65337, 65338, 65339, 65340, 65341, 65342, 65343, 65344, 65345, 65346, 65347, 65348, 65349, 65350, 65351, 65352, 65353, 65354, 65355, 65356, 65357, 65358, 65359, 65360, 65361, 65362, 65363, 65364, 65365, 65366, 65367, 65368, 65369, 65370, 65371, 65372, 65373, 65512, 66178, 66182, 66183, 66186, 66192, 66194, 66197, 66198, 66199, 66203, 66208, 66209, 66210, 66213, 66219, 66224, 66225, 66226, 66228, 66255, 66293, 66305, 66306, 66313, 66321, 66325, 66327, 66330, 66335, 66336, 66338, 66564, 66581, 66587, 66592, 66604, 66621, 66632, 66740, 66754, 66766, 66770, 66794, 66806, 66835, 66838, 66840, 66844, 66845, 66853, 66854, 66855, 68176, 70864, 71430, 71434, 71438, 71439, 71840, 71842, 71843, 71844, 71846, 71849, 71852, 71854, 71855, 71858, 71861, 71864, 71867, 71868, 71872, 71873, 71874, 71875, 71876, 71878, 71880, 71882, 71884, 71893, 71894, 71895, 71896, 71900, 71904, 71909, 71910, 71913, 71916, 71919, 71922, 93960, 93962, 93974, 93992, 94005, 94010, 94011, 94015, 94016, 94018, 94019, 94033, 94034, 119060, 119149, 119302, 119309, 119311, 119314, 119315, 119318, 119338, 119350, 119351, 119354, 119355, 119808, 119809, 119810, 119811, 119812, 119813, 119814, 119815, 119816, 119817, 119818, 119819, 119820, 119821, 119822, 119823, 119824, 119825, 119826, 119827, 119828, 119829, 119830, 119831, 119832, 119833, 119834, 119835, 119836, 119837, 119838, 119839, 119840, 119841, 119842, 119843, 119844, 119845, 119847, 119848, 119849, 119850, 119851, 119852, 119853, 119854, 119855, 119856, 119857, 119858, 119859, 119860, 119861, 119862, 119863, 119864, 119865, 119866, 119867, 119868, 119869, 119870, 119871, 119872, 119873, 119874, 119875, 119876, 119877, 119878, 119879, 119880, 119881, 119882, 119883, 119884, 119885, 119886, 119887, 119888, 119889, 119890, 119891, 119892, 119894, 119895, 119896, 119897, 119899, 119900, 119901, 119902, 119903, 119904, 119905, 119906, 119907, 119908, 119909, 119910, 119911, 119912, 119913, 119914, 119915, 119916, 
119917, 119918, 119919, 119920, 119921, 119922, 119923, 119924, 119925, 119926, 119927, 119928, 119929, 119930, 119931, 119932, 119933, 119934, 119935, 119936, 119937, 119938, 119939, 119940, 119941, 119942, 119943, 119944, 119945, 119946, 119947, 119948, 119949, 119951, 119952, 119953, 119954, 119955, 119956, 119957, 119958, 119959, 119960, 119961, 119962, 119963, 119964, 119966, 119967, 119970, 119973, 119974, 119977, 119978, 119979, 119980, 119982, 119983, 119984, 119985, 119986, 119987, 119988, 119989, 119990, 119991, 119992, 119993, 119995, 119997, 119998, 119999, 120000, 120001, 120003, 120005, 120006, 120007, 120008, 120009, 120010, 120011, 120012, 120013, 120014, 120015, 120016, 120017, 120018, 120019, 120020, 120021, 120022, 120023, 120024, 120025, 120026, 120027, 120028, 120029, 120030, 120031, 120032, 120033, 120034, 120035, 120036, 120037, 120038, 120039, 120040, 120041, 120042, 120043, 120044, 120045, 120046, 120047, 120048, 120049, 120050, 120051, 120052, 120053, 120055, 120056, 120057, 120058, 120059, 120060, 120061, 120062, 120063, 120064, 120065, 120066, 120067, 120068, 120069, 120071, 120072, 120073, 120074, 120077, 120078, 120079, 120080, 120081, 120082, 120083, 120084, 120086, 120087, 120088, 120089, 120090, 120091, 120092, 120094, 120095, 120096, 120097, 120098, 120099, 120100, 120101, 120102, 120103, 120104, 120105, 120107, 120108, 120109, 120110, 120111, 120112, 120113, 120114, 120115, 120116, 120117, 120118, 120119, 120120, 120121, 120123, 120124, 120125, 120126, 120128, 120129, 120130, 120131, 120132, 120134, 120138, 120139, 120140, 120141, 120142, 120143, 120144, 120146, 120147, 120148, 120149, 120150, 120151, 120152, 120153, 120154, 120155, 120156, 120157, 120159, 120160, 120161, 120162, 120163, 120164, 120165, 120166, 120167, 120168, 120169, 120170, 120171, 120172, 120173, 120174, 120175, 120176, 120177, 120178, 120179, 120180, 120181, 120182, 120183, 120184, 120185, 120186, 120187, 120188, 120189, 120190, 120191, 120192, 120193, 120194, 
120195, 120196, 120197, 120198, 120199, 120200, 120201, 120202, 120203, 120204, 120205, 120206, 120207, 120208, 120209, 120211, 120212, 120213, 120214, 120215, 120216, 120217, 120218, 120219, 120220, 120221, 120222, 120223, 120224, 120225, 120226, 120227, 120228, 120229, 120230, 120231, 120232, 120233, 120234, 120235, 120236, 120237, 120238, 120239, 120240, 120241, 120242, 120243, 120244, 120245, 120246, 120247, 120248, 120249, 120250, 120251, 120252, 120253, 120254, 120255, 120256, 120257, 120258, 120259, 120260, 120261, 120263, 120264, 120265, 120266, 120267, 120268, 120269, 120270, 120271, 120272, 120273, 120274, 120275, 120276, 120277, 120278, 120279, 120280, 120281, 120282, 120283, 120284, 120285, 120286, 120287, 120288, 120289, 120290, 120291, 120292, 120293, 120294, 120295, 120296, 120297, 120298, 120299, 120300, 120301, 120302, 120303, 120304, 120305, 120306, 120307, 120308, 120309, 120310, 120311, 120312, 120313, 120315, 120316, 120317, 120318, 120319, 120320, 120321, 120322, 120323, 120324, 120325, 120326, 120327, 120328, 120329, 120330, 120331, 120332, 120333, 120334, 120335, 120336, 120337, 120338, 120339, 120340, 120341, 120342, 120343, 120344, 120345, 120346, 120347, 120348, 120349, 120350, 120351, 120352, 120353, 120354, 120355, 120356, 120357, 120358, 120359, 120360, 120361, 120362, 120363, 120364, 120365, 120367, 120368, 120369, 120370, 120371, 120372, 120373, 120374, 120375, 120376, 120377, 120378, 120379, 120380, 120381, 120382, 120383, 120384, 120385, 120386, 120387, 120388, 120389, 120390, 120391, 120392, 120393, 120394, 120395, 120396, 120397, 120398, 120399, 120400, 120401, 120402, 120403, 120404, 120405, 120406, 120407, 120408, 120409, 120410, 120411, 120412, 120413, 120414, 120415, 120416, 120417, 120419, 120420, 120421, 120422, 120423, 120424, 120425, 120426, 120427, 120428, 120429, 120430, 120431, 120432, 120433, 120434, 120435, 120436, 120437, 120438, 120439, 120440, 120441, 120442, 120443, 120444, 120445, 120446, 120447, 120448, 120449, 
120450, 120451, 120452, 120453, 120454, 120455, 120456, 120457, 120458, 120459, 120460, 120461, 120462, 120463, 120464, 120465, 120466, 120467, 120468, 120469, 120471, 120472, 120473, 120474, 120475, 120476, 120477, 120478, 120479, 120480, 120481, 120482, 120483, 120484, 120488, 120489, 120492, 120493, 120494, 120496, 120497, 120499, 120500, 120502, 120504, 120507, 120508, 120510, 120514, 120516, 120522, 120526, 120528, 120530, 120532, 120534, 120544, 120546, 120547, 120550, 120551, 120552, 120554, 120555, 120557, 120558, 120560, 120562, 120565, 120566, 120568, 120572, 120574, 120580, 120584, 120586, 120588, 120590, 120592, 120602, 120604, 120605, 120608, 120609, 120610, 120612, 120613, 120615, 120616, 120618, 120620, 120623, 120624, 120626, 120630, 120632, 120638, 120642, 120644, 120646, 120648, 120650, 120660, 120662, 120663, 120666, 120667, 120668, 120670, 120671, 120673, 120674, 120676, 120678, 120681, 120682, 120684, 120688, 120690, 120696, 120700, 120702, 120704, 120706, 120708, 120718, 120720, 120721, 120724, 120725, 120726, 120728, 120729, 120731, 120732, 120734, 120736, 120739, 120740, 120742, 120746, 120748, 120754, 120758, 120760, 120762, 120764, 120766, 120776, 120778, 120782, 120783, 120784, 120785, 120786, 120787, 120788, 120789, 120790, 120791, 120792, 120793, 120794, 120795, 120796, 120797, 120798, 120799, 120800, 120801, 120802, 120803, 120804, 120805, 120806, 120807, 120808, 120809, 120810, 120811, 120812, 120813, 120814, 120815, 120816, 120817, 120818, 120819, 120820, 120821, 120822, 120823, 120824, 120825, 120826, 120827, 120828, 120829, 120830, 120831, 125127, 125131, 126464, 126500, 126564, 126592, 126596, 128844, 128872, 130032, 130033, 130034, 130035, 130036, 130037, 130038, 130039, 130040, 130041}, + With: []rune{44, 102, 98, 103, 82, 50, 51, 53, 115, 73, 33, 51, 56, 56, 63, 97, 103, 121, 105, 105, 119, 117, 121, 63, 96, 96, 96, 96, 96, 60, 62, 94, 94, 96, 96, 96, 58, 45, 105, 126, 96, 58, 96, 105, 59, 74, 96, 65, 66, 69, 90, 72, 75, 77, 
78, 79, 80, 84, 89, 88, 97, 121, 105, 118, 111, 112, 111, 117, 89, 70, 50, 99, 106, 67, 77, 83, 73, 74, 115, 105, 106, 119, 86, 118, 89, 121, 104, 101, 73, 105, 51, 100, 71, 113, 87, 119, 85, 83, 79, 96, 96, 119, 113, 113, 104, 110, 110, 117, 103, 102, 111, 58, 108, 58, 108, 118, 96, 108, 111, 96, 44, 108, 111, 46, 108, 111, 86, 44, 42, 111, 111, 45, 111, 46, 73, 111, 86, 46, 46, 58, 58, 79, 108, 96, 96, 95, 58, 111, 63, 79, 56, 57, 111, 57, 56, 58, 111, 56, 79, 79, 57, 111, 111, 111, 111, 111, 111, 111, 111, 57, 111, 111, 111, 111, 111, 121, 111, 85, 79, 68, 82, 84, 105, 89, 65, 74, 69, 63, 87, 77, 72, 89, 71, 104, 90, 52, 98, 82, 87, 83, 86, 83, 76, 67, 80, 75, 100, 54, 71, 66, 61, 86, 62, 60, 96, 85, 80, 100, 98, 74, 76, 50, 120, 72, 120, 82, 98, 70, 65, 68, 68, 77, 66, 88, 120, 32, 60, 88, 73, 96, 75, 77, 58, 43, 47, 58, 58, 99, 111, 111, 117, 118, 119, 122, 114, 103, 121, 102, 121, 96, 105, 96, 126, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 45, 45, 45, 44, 96, 46, 32, 32, 32, 96, 60, 62, 47, 45, 47, 42, 126, 58, 32, 67, 103, 72, 72, 72, 104, 73, 73, 76, 108, 78, 80, 81, 82, 82, 82, 90, 90, 75, 66, 67, 101, 101, 69, 70, 77, 111, 105, 121, 68, 100, 101, 105, 106, 73, 86, 88, 76, 67, 68, 77, 105, 118, 120, 73, 99, 100, 45, 47, 92, 42, 73, 118, 85, 58, 126, 84, 118, 85, 69, 105, 112, 97, 73, 47, 88, 40, 41, 60, 62, 40, 41, 123, 125, 43, 45, 47, 92, 84, 120, 120, 92, 47, 92, 120, 114, 72, 73, 75, 77, 78, 79, 111, 80, 112, 67, 99, 84, 89, 88, 45, 47, 57, 51, 76, 54, 86, 69, 73, 33, 79, 81, 88, 61, 92, 47, 79, 40, 41, 47, 61, 47, 92, 92, 47, 66, 80, 100, 68, 84, 71, 75, 74, 67, 90, 70, 77, 78, 76, 83, 82, 86, 72, 87, 88, 89, 65, 69, 73, 79, 85, 46, 44, 58, 61, 46, 50, 105, 86, 63, 50, 115, 50, 51, 57, 38, 58, 96, 70, 102, 117, 51, 74, 88, 66, 101, 102, 111, 114, 114, 117, 117, 121, 105, 114, 119, 122, 118, 115, 99, 111, 111, 111, 111, 111, 111, 111, 111, 40, 41, 58, 95, 95, 95, 45, 92, 108, 108, 111, 111, 111, 111, 34, 36, 37, 38, 96, 42, 43, 45, 46, 
47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 73, 66, 69, 70, 124, 88, 79, 80, 83, 84, 43, 65, 66, 67, 70, 79, 77, 84, 89, 88, 72, 90, 66, 67, 124, 77, 84, 88, 56, 42, 108, 88, 79, 67, 76, 83, 111, 99, 115, 82, 79, 85, 55, 111, 117, 78, 79, 75, 67, 86, 70, 76, 88, 46, 79, 118, 119, 119, 119, 86, 70, 76, 89, 69, 90, 57, 69, 52, 76, 79, 85, 53, 84, 118, 115, 70, 105, 122, 55, 111, 51, 57, 54, 57, 111, 117, 121, 79, 90, 87, 67, 88, 87, 67, 86, 84, 76, 73, 82, 83, 51, 62, 65, 85, 89, 96, 96, 123, 46, 51, 86, 92, 55, 70, 82, 76, 60, 62, 47, 92, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 105, 106, 107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 67, 68, 71, 74, 75, 78, 79, 80, 81, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 102, 104, 105, 106, 107, 108, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 68, 69, 70, 71, 74, 75, 76, 77, 78, 79, 80, 81, 83, 84, 85, 86, 87, 88, 89, 97, 98, 
99, 100, 101, 102, 103, 104, 105, 106, 107, 73, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 68, 69, 70, 71, 73, 74, 75, 76, 77, 79, 83, 84, 85, 86, 87, 88, 89, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 73, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 73, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 73, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 73, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 73, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 73, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 73, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 105, 65, 66, 69, 90, 72, 73, 75, 77, 78, 79, 80, 84, 89, 88, 97, 121, 105, 118, 111, 112, 111, 117, 112, 65, 66, 69, 90, 72, 73, 75, 77, 78, 79, 80, 84, 89, 88, 97, 121, 105, 118, 111, 112, 111, 117, 112, 65, 66, 69, 90, 72, 73, 75, 77, 78, 79, 80, 84, 89, 88, 97, 121, 105, 118, 111, 112, 111, 117, 112, 65, 66, 69, 90, 72, 73, 75, 77, 
78, 79, 80, 84, 89, 88, 97, 121, 105, 118, 111, 112, 111, 117, 112, 65, 66, 69, 90, 72, 73, 75, 77, 78, 79, 80, 84, 89, 88, 97, 121, 105, 118, 111, 112, 111, 117, 112, 70, 79, 73, 50, 51, 52, 53, 54, 55, 56, 57, 79, 73, 50, 51, 52, 53, 54, 55, 56, 57, 79, 73, 50, 51, 52, 53, 54, 55, 56, 57, 79, 73, 50, 51, 52, 53, 54, 55, 56, 57, 79, 73, 50, 51, 52, 53, 54, 55, 56, 57, 108, 56, 108, 111, 111, 108, 111, 67, 84, 79, 73, 50, 51, 52, 53, 54, 55, 56, 57}, + Locale: "_common", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 184, Hi: 383, Stride: 199}, + {Lo: 388, Hi: 397, Stride: 9}, + {Lo: 422, Hi: 423, Stride: 1}, + {Lo: 439, Hi: 444, Stride: 5}, + {Lo: 445, Hi: 451, Stride: 3}, + {Lo: 540, Hi: 546, Stride: 6}, + {Lo: 547, Hi: 577, Stride: 30}, + {Lo: 593, Hi: 609, Stride: 16}, + {Lo: 611, Hi: 617, Stride: 6}, + {Lo: 618, Hi: 623, Stride: 5}, + {Lo: 651, Hi: 655, Stride: 4}, + {Lo: 660, Hi: 697, Stride: 37}, + {Lo: 699, Hi: 702, Stride: 1}, + {Lo: 706, Hi: 708, Stride: 1}, + {Lo: 710, Hi: 714, Stride: 2}, + {Lo: 715, Hi: 720, Stride: 5}, + {Lo: 727, Hi: 731, Stride: 4}, + {Lo: 732, Hi: 756, Stride: 24}, + {Lo: 760, Hi: 884, Stride: 124}, + {Lo: 890, Hi: 894, Stride: 4}, + {Lo: 895, Hi: 900, Stride: 5}, + {Lo: 913, Hi: 914, Stride: 1}, + {Lo: 917, Hi: 919, Stride: 1}, + {Lo: 922, Hi: 924, Stride: 2}, + {Lo: 925, Hi: 929, Stride: 2}, + {Lo: 932, Hi: 933, Stride: 1}, + {Lo: 935, Hi: 945, Stride: 10}, + {Lo: 947, Hi: 953, Stride: 6}, + {Lo: 957, Hi: 965, Stride: 2}, + {Lo: 978, Hi: 988, Stride: 10}, + {Lo: 1000, Hi: 1010, Stride: 10}, + {Lo: 1011, Hi: 1017, Stride: 6}, + {Lo: 1018, Hi: 1029, Stride: 11}, + {Lo: 1030, Hi: 1032, Stride: 2}, + {Lo: 1109, Hi: 1110, Stride: 1}, + {Lo: 1112, Hi: 1121, Stride: 9}, + {Lo: 1140, Hi: 1141, Stride: 1}, + {Lo: 1198, Hi: 1199, Stride: 1}, + {Lo: 1211, Hi: 1213, Stride: 2}, + {Lo: 1216, Hi: 1231, Stride: 15}, + {Lo: 1248, Hi: 1281, Stride: 33}, + {Lo: 1292, Hi: 1307, Stride: 15}, + {Lo: 1308, Hi: 1309, Stride: 1}, + 
{Lo: 1357, Hi: 1359, Stride: 2}, + {Lo: 1365, Hi: 1370, Stride: 5}, + {Lo: 1373, Hi: 1377, Stride: 4}, + {Lo: 1379, Hi: 1382, Stride: 3}, + {Lo: 1392, Hi: 1400, Stride: 8}, + {Lo: 1404, Hi: 1405, Stride: 1}, + {Lo: 1409, Hi: 1412, Stride: 3}, + {Lo: 1413, Hi: 1417, Stride: 4}, + {Lo: 1472, Hi: 1475, Stride: 3}, + {Lo: 1493, Hi: 1496, Stride: 3}, + {Lo: 1497, Hi: 1503, Stride: 6}, + {Lo: 1505, Hi: 1523, Stride: 18}, + {Lo: 1549, Hi: 1575, Stride: 26}, + {Lo: 1607, Hi: 1632, Stride: 25}, + {Lo: 1633, Hi: 1637, Stride: 4}, + {Lo: 1639, Hi: 1643, Stride: 4}, + {Lo: 1645, Hi: 1726, Stride: 81}, + {Lo: 1729, Hi: 1748, Stride: 19}, + {Lo: 1749, Hi: 1776, Stride: 27}, + {Lo: 1777, Hi: 1781, Stride: 4}, + {Lo: 1783, Hi: 1793, Stride: 10}, + {Lo: 1794, Hi: 1796, Stride: 1}, + {Lo: 1984, Hi: 1994, Stride: 10}, + {Lo: 2036, Hi: 2037, Stride: 1}, + {Lo: 2042, Hi: 2307, Stride: 265}, + {Lo: 2406, Hi: 2429, Stride: 23}, + {Lo: 2534, Hi: 2538, Stride: 4}, + {Lo: 2541, Hi: 2662, Stride: 121}, + {Lo: 2663, Hi: 2666, Stride: 3}, + {Lo: 2691, Hi: 2790, Stride: 99}, + {Lo: 2819, Hi: 2848, Stride: 29}, + {Lo: 2918, Hi: 2920, Stride: 2}, + {Lo: 3046, Hi: 3074, Stride: 28}, + {Lo: 3174, Hi: 3202, Stride: 28}, + {Lo: 3302, Hi: 3330, Stride: 28}, + {Lo: 3360, Hi: 3430, Stride: 70}, + {Lo: 3437, Hi: 3458, Stride: 21}, + {Lo: 3664, Hi: 3792, Stride: 128}, + {Lo: 4125, Hi: 4160, Stride: 35}, + {Lo: 4327, Hi: 4351, Stride: 24}, + {Lo: 4608, Hi: 5024, Stride: 208}, + {Lo: 5025, Hi: 5026, Stride: 1}, + {Lo: 5029, Hi: 5033, Stride: 4}, + {Lo: 5034, Hi: 5036, Stride: 1}, + {Lo: 5038, Hi: 5043, Stride: 5}, + {Lo: 5047, Hi: 5051, Stride: 4}, + {Lo: 5053, Hi: 5056, Stride: 3}, + {Lo: 5058, Hi: 5059, Stride: 1}, + {Lo: 5070, Hi: 5071, Stride: 1}, + {Lo: 5074, Hi: 5076, Stride: 2}, + {Lo: 5077, Hi: 5081, Stride: 4}, + {Lo: 5082, Hi: 5086, Stride: 4}, + {Lo: 5087, Hi: 5090, Stride: 3}, + {Lo: 5094, Hi: 5095, Stride: 1}, + {Lo: 5102, Hi: 5107, Stride: 5}, + {Lo: 5108, Hi: 5120, Stride: 12}, + {Lo: 5167, 
Hi: 5171, Stride: 4}, + {Lo: 5176, Hi: 5194, Stride: 18}, + {Lo: 5196, Hi: 5229, Stride: 33}, + {Lo: 5231, Hi: 5234, Stride: 3}, + {Lo: 5261, Hi: 5290, Stride: 29}, + {Lo: 5311, Hi: 5441, Stride: 130}, + {Lo: 5500, Hi: 5501, Stride: 1}, + {Lo: 5511, Hi: 5551, Stride: 40}, + {Lo: 5556, Hi: 5573, Stride: 17}, + {Lo: 5598, Hi: 5610, Stride: 12}, + {Lo: 5616, Hi: 5623, Stride: 7}, + {Lo: 5741, Hi: 5742, Stride: 1}, + {Lo: 5760, Hi: 5810, Stride: 50}, + {Lo: 5815, Hi: 5825, Stride: 10}, + {Lo: 5836, Hi: 5845, Stride: 9}, + {Lo: 5846, Hi: 5868, Stride: 22}, + {Lo: 5869, Hi: 5941, Stride: 72}, + {Lo: 6147, Hi: 6153, Stride: 6}, + {Lo: 7428, Hi: 7439, Stride: 11}, + {Lo: 7441, Hi: 7452, Stride: 11}, + {Lo: 7456, Hi: 7458, Stride: 1}, + {Lo: 7462, Hi: 7555, Stride: 93}, + {Lo: 7564, Hi: 7837, Stride: 273}, + {Lo: 7935, Hi: 8125, Stride: 190}, + {Lo: 8126, Hi: 8128, Stride: 1}, + {Lo: 8175, Hi: 8189, Stride: 14}, + {Lo: 8190, Hi: 8192, Stride: 2}, + {Lo: 8193, Hi: 8202, Stride: 1}, + {Lo: 8208, Hi: 8210, Stride: 1}, + {Lo: 8218, Hi: 8219, Stride: 1}, + {Lo: 8228, Hi: 8232, Stride: 4}, + {Lo: 8233, Hi: 8239, Stride: 6}, + {Lo: 8242, Hi: 8249, Stride: 7}, + {Lo: 8250, Hi: 8257, Stride: 7}, + {Lo: 8259, Hi: 8260, Stride: 1}, + {Lo: 8270, Hi: 8275, Stride: 5}, + {Lo: 8282, Hi: 8287, Stride: 5}, + {Lo: 8450, Hi: 8458, Stride: 8}, + {Lo: 8459, Hi: 8462, Stride: 1}, + {Lo: 8464, Hi: 8467, Stride: 1}, + {Lo: 8469, Hi: 8473, Stride: 4}, + {Lo: 8474, Hi: 8477, Stride: 1}, + {Lo: 8484, Hi: 8488, Stride: 4}, + {Lo: 8490, Hi: 8492, Stride: 2}, + {Lo: 8493, Hi: 8497, Stride: 1}, + {Lo: 8499, Hi: 8500, Stride: 1}, + {Lo: 8505, Hi: 8509, Stride: 4}, + {Lo: 8517, Hi: 8521, Stride: 1}, + {Lo: 8544, Hi: 8548, Stride: 4}, + {Lo: 8553, Hi: 8556, Stride: 3}, + {Lo: 8557, Hi: 8560, Stride: 1}, + {Lo: 8564, Hi: 8569, Stride: 5}, + {Lo: 8572, Hi: 8574, Stride: 1}, + {Lo: 8722, Hi: 8725, Stride: 3}, + {Lo: 8726, Hi: 8727, Stride: 1}, + {Lo: 8739, Hi: 8744, Stride: 5}, + {Lo: 8746, Hi: 8758, Stride: 
12}, + {Lo: 8764, Hi: 8868, Stride: 104}, + {Lo: 8897, Hi: 8899, Stride: 2}, + {Lo: 8959, Hi: 9075, Stride: 116}, + {Lo: 9076, Hi: 9082, Stride: 6}, + {Lo: 9213, Hi: 9585, Stride: 372}, + {Lo: 9587, Hi: 10088, Stride: 501}, + {Lo: 10089, Hi: 10094, Stride: 5}, + {Lo: 10095, Hi: 10098, Stride: 3}, + {Lo: 10099, Hi: 10101, Stride: 1}, + {Lo: 10133, Hi: 10134, Stride: 1}, + {Lo: 10187, Hi: 10189, Stride: 2}, + {Lo: 10201, Hi: 10539, Stride: 338}, + {Lo: 10540, Hi: 10741, Stride: 201}, + {Lo: 10744, Hi: 10745, Stride: 1}, + {Lo: 10799, Hi: 11397, Stride: 598}, + {Lo: 11406, Hi: 11410, Stride: 4}, + {Lo: 11412, Hi: 11416, Stride: 4}, + {Lo: 11418, Hi: 11422, Stride: 4}, + {Lo: 11423, Hi: 11426, Stride: 3}, + {Lo: 11427, Hi: 11430, Stride: 1}, + {Lo: 11432, Hi: 11436, Stride: 4}, + {Lo: 11450, Hi: 11462, Stride: 12}, + {Lo: 11466, Hi: 11468, Stride: 2}, + {Lo: 11472, Hi: 11474, Stride: 2}, + {Lo: 11576, Hi: 11577, Stride: 1}, + {Lo: 11599, Hi: 11601, Stride: 2}, + {Lo: 11604, Hi: 11605, Stride: 1}, + {Lo: 11613, Hi: 11840, Stride: 227}, + {Lo: 12034, Hi: 12035, Stride: 1}, + {Lo: 12295, Hi: 12308, Stride: 13}, + {Lo: 12309, Hi: 12339, Stride: 30}, + {Lo: 12448, Hi: 12755, Stride: 307}, + {Lo: 12756, Hi: 20022, Stride: 7266}, + {Lo: 20031, Hi: 42192, Stride: 22161}, + {Lo: 42193, Hi: 42196, Stride: 1}, + {Lo: 42198, Hi: 42199, Stride: 1}, + {Lo: 42201, Hi: 42202, Stride: 1}, + {Lo: 42204, Hi: 42205, Stride: 1}, + {Lo: 42207, Hi: 42211, Stride: 1}, + {Lo: 42214, Hi: 42215, Stride: 1}, + {Lo: 42218, Hi: 42220, Stride: 1}, + {Lo: 42222, Hi: 42226, Stride: 2}, + {Lo: 42227, Hi: 42228, Stride: 1}, + {Lo: 42232, Hi: 42233, Stride: 1}, + {Lo: 42237, Hi: 42239, Stride: 2}, + {Lo: 42510, Hi: 42564, Stride: 54}, + {Lo: 42567, Hi: 42719, Stride: 152}, + {Lo: 42731, Hi: 42735, Stride: 4}, + {Lo: 42801, Hi: 42842, Stride: 41}, + {Lo: 42858, Hi: 42862, Stride: 4}, + {Lo: 42872, Hi: 42889, Stride: 17}, + {Lo: 42892, Hi: 42904, Stride: 12}, + {Lo: 42905, Hi: 42911, Stride: 6}, + {Lo: 
42923, Hi: 42930, Stride: 7}, + {Lo: 42931, Hi: 42932, Stride: 1}, + {Lo: 43826, Hi: 43829, Stride: 3}, + {Lo: 43837, Hi: 43847, Stride: 10}, + {Lo: 43848, Hi: 43854, Stride: 6}, + {Lo: 43858, Hi: 43866, Stride: 8}, + {Lo: 43893, Hi: 43905, Stride: 12}, + {Lo: 43907, Hi: 43923, Stride: 16}, + {Lo: 43945, Hi: 43946, Stride: 1}, + {Lo: 43951, Hi: 64422, Stride: 20471}, + {Lo: 64423, Hi: 64429, Stride: 1}, + {Lo: 64830, Hi: 64831, Stride: 1}, + {Lo: 65072, Hi: 65101, Stride: 29}, + {Lo: 65102, Hi: 65103, Stride: 1}, + {Lo: 65112, Hi: 65128, Stride: 16}, + {Lo: 65165, Hi: 65166, Stride: 1}, + {Lo: 65257, Hi: 65260, Stride: 1}, + {Lo: 65282, Hi: 65284, Stride: 2}, + {Lo: 65285, Hi: 65287, Stride: 1}, + {Lo: 65290, Hi: 65291, Stride: 1}, + {Lo: 65293, Hi: 65305, Stride: 1}, + {Lo: 65308, Hi: 65310, Stride: 1}, + {Lo: 65312, Hi: 65373, Stride: 1}, + {Lo: 65512, Hi: 65512, Stride: 1}, + }, + R32: []unicode.Range32{ + {Lo: 66178, Hi: 66182, Stride: 4}, + {Lo: 66183, Hi: 66186, Stride: 3}, + {Lo: 66192, Hi: 66194, Stride: 2}, + {Lo: 66197, Hi: 66199, Stride: 1}, + {Lo: 66203, Hi: 66208, Stride: 5}, + {Lo: 66209, Hi: 66210, Stride: 1}, + {Lo: 66213, Hi: 66219, Stride: 6}, + {Lo: 66224, Hi: 66226, Stride: 1}, + {Lo: 66228, Hi: 66255, Stride: 27}, + {Lo: 66293, Hi: 66305, Stride: 12}, + {Lo: 66306, Hi: 66313, Stride: 7}, + {Lo: 66321, Hi: 66325, Stride: 4}, + {Lo: 66327, Hi: 66330, Stride: 3}, + {Lo: 66335, Hi: 66336, Stride: 1}, + {Lo: 66338, Hi: 66564, Stride: 226}, + {Lo: 66581, Hi: 66587, Stride: 6}, + {Lo: 66592, Hi: 66604, Stride: 12}, + {Lo: 66621, Hi: 66632, Stride: 11}, + {Lo: 66740, Hi: 66754, Stride: 14}, + {Lo: 66766, Hi: 66770, Stride: 4}, + {Lo: 66794, Hi: 66806, Stride: 12}, + {Lo: 66835, Hi: 66838, Stride: 3}, + {Lo: 66840, Hi: 66844, Stride: 4}, + {Lo: 66845, Hi: 66853, Stride: 8}, + {Lo: 66854, Hi: 66855, Stride: 1}, + {Lo: 68176, Hi: 70864, Stride: 2688}, + {Lo: 71430, Hi: 71438, Stride: 4}, + {Lo: 71439, Hi: 71840, Stride: 401}, + {Lo: 71842, Hi: 71844, 
Stride: 1}, + {Lo: 71846, Hi: 71852, Stride: 3}, + {Lo: 71854, Hi: 71855, Stride: 1}, + {Lo: 71858, Hi: 71867, Stride: 3}, + {Lo: 71868, Hi: 71872, Stride: 4}, + {Lo: 71873, Hi: 71876, Stride: 1}, + {Lo: 71878, Hi: 71884, Stride: 2}, + {Lo: 71893, Hi: 71896, Stride: 1}, + {Lo: 71900, Hi: 71904, Stride: 4}, + {Lo: 71909, Hi: 71910, Stride: 1}, + {Lo: 71913, Hi: 71922, Stride: 3}, + {Lo: 93960, Hi: 93962, Stride: 2}, + {Lo: 93974, Hi: 93992, Stride: 18}, + {Lo: 94005, Hi: 94010, Stride: 5}, + {Lo: 94011, Hi: 94015, Stride: 4}, + {Lo: 94016, Hi: 94018, Stride: 2}, + {Lo: 94019, Hi: 94033, Stride: 14}, + {Lo: 94034, Hi: 119060, Stride: 25026}, + {Lo: 119149, Hi: 119302, Stride: 153}, + {Lo: 119309, Hi: 119311, Stride: 2}, + {Lo: 119314, Hi: 119315, Stride: 1}, + {Lo: 119318, Hi: 119338, Stride: 20}, + {Lo: 119350, Hi: 119351, Stride: 1}, + {Lo: 119354, Hi: 119355, Stride: 1}, + {Lo: 119808, Hi: 119845, Stride: 1}, + {Lo: 119847, Hi: 119892, Stride: 1}, + {Lo: 119894, Hi: 119897, Stride: 1}, + {Lo: 119899, Hi: 119949, Stride: 1}, + {Lo: 119951, Hi: 119964, Stride: 1}, + {Lo: 119966, Hi: 119967, Stride: 1}, + {Lo: 119970, Hi: 119973, Stride: 3}, + {Lo: 119974, Hi: 119977, Stride: 3}, + {Lo: 119978, Hi: 119980, Stride: 1}, + {Lo: 119982, Hi: 119993, Stride: 1}, + {Lo: 119995, Hi: 119997, Stride: 2}, + {Lo: 119998, Hi: 120001, Stride: 1}, + {Lo: 120003, Hi: 120005, Stride: 2}, + {Lo: 120006, Hi: 120053, Stride: 1}, + {Lo: 120055, Hi: 120069, Stride: 1}, + {Lo: 120071, Hi: 120074, Stride: 1}, + {Lo: 120077, Hi: 120084, Stride: 1}, + {Lo: 120086, Hi: 120092, Stride: 1}, + {Lo: 120094, Hi: 120105, Stride: 1}, + {Lo: 120107, Hi: 120121, Stride: 1}, + {Lo: 120123, Hi: 120126, Stride: 1}, + {Lo: 120128, Hi: 120132, Stride: 1}, + {Lo: 120134, Hi: 120138, Stride: 4}, + {Lo: 120139, Hi: 120144, Stride: 1}, + {Lo: 120146, Hi: 120157, Stride: 1}, + {Lo: 120159, Hi: 120209, Stride: 1}, + {Lo: 120211, Hi: 120261, Stride: 1}, + {Lo: 120263, Hi: 120313, Stride: 1}, + {Lo: 120315, Hi: 
120365, Stride: 1}, + {Lo: 120367, Hi: 120417, Stride: 1}, + {Lo: 120419, Hi: 120469, Stride: 1}, + {Lo: 120471, Hi: 120484, Stride: 1}, + {Lo: 120488, Hi: 120489, Stride: 1}, + {Lo: 120492, Hi: 120494, Stride: 1}, + {Lo: 120496, Hi: 120497, Stride: 1}, + {Lo: 120499, Hi: 120500, Stride: 1}, + {Lo: 120502, Hi: 120504, Stride: 2}, + {Lo: 120507, Hi: 120508, Stride: 1}, + {Lo: 120510, Hi: 120514, Stride: 4}, + {Lo: 120516, Hi: 120522, Stride: 6}, + {Lo: 120526, Hi: 120534, Stride: 2}, + {Lo: 120544, Hi: 120546, Stride: 2}, + {Lo: 120547, Hi: 120550, Stride: 3}, + {Lo: 120551, Hi: 120552, Stride: 1}, + {Lo: 120554, Hi: 120555, Stride: 1}, + {Lo: 120557, Hi: 120558, Stride: 1}, + {Lo: 120560, Hi: 120562, Stride: 2}, + {Lo: 120565, Hi: 120566, Stride: 1}, + {Lo: 120568, Hi: 120572, Stride: 4}, + {Lo: 120574, Hi: 120580, Stride: 6}, + {Lo: 120584, Hi: 120592, Stride: 2}, + {Lo: 120602, Hi: 120604, Stride: 2}, + {Lo: 120605, Hi: 120608, Stride: 3}, + {Lo: 120609, Hi: 120610, Stride: 1}, + {Lo: 120612, Hi: 120613, Stride: 1}, + {Lo: 120615, Hi: 120616, Stride: 1}, + {Lo: 120618, Hi: 120620, Stride: 2}, + {Lo: 120623, Hi: 120624, Stride: 1}, + {Lo: 120626, Hi: 120630, Stride: 4}, + {Lo: 120632, Hi: 120638, Stride: 6}, + {Lo: 120642, Hi: 120650, Stride: 2}, + {Lo: 120660, Hi: 120662, Stride: 2}, + {Lo: 120663, Hi: 120666, Stride: 3}, + {Lo: 120667, Hi: 120668, Stride: 1}, + {Lo: 120670, Hi: 120671, Stride: 1}, + {Lo: 120673, Hi: 120674, Stride: 1}, + {Lo: 120676, Hi: 120678, Stride: 2}, + {Lo: 120681, Hi: 120682, Stride: 1}, + {Lo: 120684, Hi: 120688, Stride: 4}, + {Lo: 120690, Hi: 120696, Stride: 6}, + {Lo: 120700, Hi: 120708, Stride: 2}, + {Lo: 120718, Hi: 120720, Stride: 2}, + {Lo: 120721, Hi: 120724, Stride: 3}, + {Lo: 120725, Hi: 120726, Stride: 1}, + {Lo: 120728, Hi: 120729, Stride: 1}, + {Lo: 120731, Hi: 120732, Stride: 1}, + {Lo: 120734, Hi: 120736, Stride: 2}, + {Lo: 120739, Hi: 120740, Stride: 1}, + {Lo: 120742, Hi: 120746, Stride: 4}, + {Lo: 120748, Hi: 120754, 
Stride: 6}, + {Lo: 120758, Hi: 120766, Stride: 2}, + {Lo: 120776, Hi: 120778, Stride: 2}, + {Lo: 120782, Hi: 120831, Stride: 1}, + {Lo: 125127, Hi: 125131, Stride: 4}, + {Lo: 126464, Hi: 126500, Stride: 36}, + {Lo: 126564, Hi: 126592, Stride: 28}, + {Lo: 126596, Hi: 128844, Stride: 2248}, + {Lo: 128872, Hi: 130032, Stride: 1160}, + {Lo: 130033, Hi: 130041, Stride: 1}, + }, + LatinOffset: 0, + }, + }, + "_default": { + Confusable: []rune{160, 180, 215, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8211, 8216, 8217, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{32, 96, 120, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 45, 96, 96, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "_default", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 160, Hi: 180, Stride: 20}, + {Lo: 215, Hi: 305, Stride: 90}, + {Lo: 921, Hi: 1009, Stride: 88}, + {Lo: 1040, Hi: 1042, Stride: 2}, + {Lo: 1045, Hi: 1047, Stride: 2}, + {Lo: 1050, Hi: 1052, Stride: 2}, + {Lo: 1053, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8211, Hi: 8216, Stride: 5}, + {Lo: 8217, Hi: 8245, Stride: 28}, + {Lo: 12494, Hi: 65281, Stride: 52787}, + {Lo: 65283, Hi: 65288, Stride: 5}, + {Lo: 65289, Hi: 65292, Stride: 3}, + {Lo: 65306, Hi: 65307, Stride: 1}, + {Lo: 65311, Hi: 65374, Stride: 63}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "cs": { + Confusable: []rune{180, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8216, 8217, 8245, 12494, 65281, 65283, 65288, 
65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{96, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 96, 96, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "cs", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 305, Stride: 125}, + {Lo: 921, Hi: 1009, Stride: 88}, + {Lo: 1040, Hi: 1042, Stride: 2}, + {Lo: 1045, Hi: 1047, Stride: 2}, + {Lo: 1050, Hi: 1052, Stride: 2}, + {Lo: 1053, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8216, Hi: 8217, Stride: 1}, + {Lo: 8245, Hi: 12494, Stride: 4249}, + {Lo: 65281, Hi: 65283, Stride: 2}, + {Lo: 65288, Hi: 65289, Stride: 1}, + {Lo: 65292, Hi: 65306, Stride: 14}, + {Lo: 65307, Hi: 65311, Stride: 4}, + {Lo: 65374, Hi: 65374, Stride: 1}, + }, + R32: []unicode.Range32{}, + LatinOffset: 0, + }, + }, + "de": { + Confusable: []rune{180, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8216, 8217, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{96, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 96, 96, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "de", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 305, Stride: 125}, + {Lo: 921, Hi: 1009, Stride: 88}, + {Lo: 1040, Hi: 1042, Stride: 2}, + {Lo: 1045, Hi: 1047, Stride: 2}, + {Lo: 1050, Hi: 1052, Stride: 2}, + {Lo: 1053, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, 
Stride: 2}, + {Lo: 8216, Hi: 8217, Stride: 1}, + {Lo: 8245, Hi: 12494, Stride: 4249}, + {Lo: 65281, Hi: 65283, Stride: 2}, + {Lo: 65288, Hi: 65289, Stride: 1}, + {Lo: 65292, Hi: 65306, Stride: 14}, + {Lo: 65307, Hi: 65311, Stride: 4}, + {Lo: 65374, Hi: 65374, Stride: 1}, + }, + R32: []unicode.Range32{}, + LatinOffset: 0, + }, + }, + "es": { + Confusable: []rune{180, 215, 305, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8211, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{96, 120, 105, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 45, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "es", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 215, Stride: 35}, + {Lo: 305, Hi: 1009, Stride: 704}, + {Lo: 1040, Hi: 1042, Stride: 2}, + {Lo: 1045, Hi: 1047, Stride: 2}, + {Lo: 1050, Hi: 1052, Stride: 2}, + {Lo: 1053, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8211, Hi: 8245, Stride: 34}, + {Lo: 12494, Hi: 65281, Stride: 52787}, + {Lo: 65283, Hi: 65288, Stride: 5}, + {Lo: 65289, Hi: 65292, Stride: 3}, + {Lo: 65306, Hi: 65307, Stride: 1}, + {Lo: 65311, Hi: 65374, Stride: 63}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "fr": { + Confusable: []rune{215, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8216, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{120, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 96, 96, 47, 33, 35, 
40, 41, 44, 58, 59, 63, 126}, + Locale: "fr", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 215, Hi: 305, Stride: 90}, + {Lo: 921, Hi: 1009, Stride: 88}, + {Lo: 1040, Hi: 1042, Stride: 2}, + {Lo: 1045, Hi: 1047, Stride: 2}, + {Lo: 1050, Hi: 1052, Stride: 2}, + {Lo: 1053, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8216, Hi: 8245, Stride: 29}, + {Lo: 12494, Hi: 65281, Stride: 52787}, + {Lo: 65283, Hi: 65288, Stride: 5}, + {Lo: 65289, Hi: 65292, Stride: 3}, + {Lo: 65306, Hi: 65307, Stride: 1}, + {Lo: 65311, Hi: 65374, Stride: 63}, + }, + R32: []unicode.Range32{}, + LatinOffset: 0, + }, + }, + "it": { + Confusable: []rune{160, 180, 215, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8211, 8216, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{32, 96, 120, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 45, 96, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "it", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 160, Hi: 180, Stride: 20}, + {Lo: 215, Hi: 305, Stride: 90}, + {Lo: 921, Hi: 1009, Stride: 88}, + {Lo: 1040, Hi: 1042, Stride: 2}, + {Lo: 1045, Hi: 1047, Stride: 2}, + {Lo: 1050, Hi: 1052, Stride: 2}, + {Lo: 1053, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8211, Hi: 8216, Stride: 5}, + {Lo: 8245, Hi: 12494, Stride: 4249}, + {Lo: 65281, Hi: 65283, Stride: 2}, + {Lo: 65288, Hi: 65289, Stride: 1}, + {Lo: 65292, Hi: 65306, 
Stride: 14}, + {Lo: 65307, Hi: 65311, Stride: 4}, + {Lo: 65374, Hi: 65374, Stride: 1}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "ja": { + Confusable: []rune{180, 215, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8211, 8216, 8217, 8245, 65281, 65283, 65292, 65306, 65307}, + With: []rune{96, 120, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 45, 96, 96, 96, 33, 35, 44, 58, 59}, + Locale: "ja", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 215, Stride: 35}, + {Lo: 305, Hi: 921, Stride: 616}, + {Lo: 1009, Hi: 1040, Stride: 31}, + {Lo: 1042, Hi: 1045, Stride: 3}, + {Lo: 1047, Hi: 1050, Stride: 3}, + {Lo: 1052, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8211, Hi: 8216, Stride: 5}, + {Lo: 8217, Hi: 8245, Stride: 28}, + {Lo: 65281, Hi: 65283, Stride: 2}, + {Lo: 65292, Hi: 65306, Stride: 14}, + {Lo: 65307, Hi: 65307, Stride: 1}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "ko": { + Confusable: []rune{180, 215, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8211, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{96, 120, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 45, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "ko", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 215, Stride: 35}, + {Lo: 305, Hi: 921, Stride: 616}, + {Lo: 1009, Hi: 1040, Stride: 31}, + {Lo: 1042, Hi: 1045, Stride: 3}, + 
{Lo: 1047, Hi: 1050, Stride: 3}, + {Lo: 1052, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8211, Hi: 8245, Stride: 34}, + {Lo: 12494, Hi: 65281, Stride: 52787}, + {Lo: 65283, Hi: 65288, Stride: 5}, + {Lo: 65289, Hi: 65292, Stride: 3}, + {Lo: 65306, Hi: 65307, Stride: 1}, + {Lo: 65311, Hi: 65374, Stride: 63}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "pl": { + Confusable: []rune{180, 215, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8216, 8217, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{96, 120, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 96, 96, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "pl", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 215, Stride: 35}, + {Lo: 305, Hi: 921, Stride: 616}, + {Lo: 1009, Hi: 1040, Stride: 31}, + {Lo: 1042, Hi: 1045, Stride: 3}, + {Lo: 1047, Hi: 1050, Stride: 3}, + {Lo: 1052, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8216, Hi: 8217, Stride: 1}, + {Lo: 8245, Hi: 12494, Stride: 4249}, + {Lo: 65281, Hi: 65283, Stride: 2}, + {Lo: 65288, Hi: 65289, Stride: 1}, + {Lo: 65292, Hi: 65306, Stride: 14}, + {Lo: 65307, Hi: 65311, Stride: 4}, + {Lo: 65374, Hi: 65374, Stride: 1}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "pt-BR": { + Confusable: []rune{180, 215, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 
1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8216, 8217, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{96, 120, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 96, 96, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "pt-BR", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 215, Stride: 35}, + {Lo: 305, Hi: 921, Stride: 616}, + {Lo: 1009, Hi: 1040, Stride: 31}, + {Lo: 1042, Hi: 1045, Stride: 3}, + {Lo: 1047, Hi: 1050, Stride: 3}, + {Lo: 1052, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8216, Hi: 8217, Stride: 1}, + {Lo: 8245, Hi: 12494, Stride: 4249}, + {Lo: 65281, Hi: 65283, Stride: 2}, + {Lo: 65288, Hi: 65289, Stride: 1}, + {Lo: 65292, Hi: 65306, Stride: 14}, + {Lo: 65307, Hi: 65311, Stride: 4}, + {Lo: 65374, Hi: 65374, Stride: 1}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "qps-ploc": { + Confusable: []rune{160, 180, 215, 305, 921, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8211, 8216, 8217, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{32, 96, 120, 105, 73, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 45, 96, 96, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "qps-ploc", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 160, Hi: 180, Stride: 20}, + {Lo: 215, Hi: 305, Stride: 90}, + {Lo: 921, Hi: 1040, Stride: 119}, + {Lo: 1042, Hi: 1045, Stride: 3}, + {Lo: 1047, Hi: 1050, Stride: 3}, + {Lo: 1052, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, 
Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8211, Hi: 8216, Stride: 5}, + {Lo: 8217, Hi: 8245, Stride: 28}, + {Lo: 12494, Hi: 65281, Stride: 52787}, + {Lo: 65283, Hi: 65288, Stride: 5}, + {Lo: 65289, Hi: 65292, Stride: 3}, + {Lo: 65306, Hi: 65307, Stride: 1}, + {Lo: 65311, Hi: 65374, Stride: 63}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "ru": { + Confusable: []rune{180, 215, 305, 921, 1009, 8216, 8217, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{96, 120, 105, 73, 112, 96, 96, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "ru", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 215, Stride: 35}, + {Lo: 305, Hi: 921, Stride: 616}, + {Lo: 1009, Hi: 8216, Stride: 7207}, + {Lo: 8217, Hi: 8245, Stride: 28}, + {Lo: 12494, Hi: 65281, Stride: 52787}, + {Lo: 65283, Hi: 65288, Stride: 5}, + {Lo: 65289, Hi: 65292, Stride: 3}, + {Lo: 65306, Hi: 65307, Stride: 1}, + {Lo: 65311, Hi: 65374, Stride: 63}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "tr": { + Confusable: []rune{160, 180, 215, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8211, 8245, 12494, 65281, 65283, 65288, 65289, 65292, 65306, 65307, 65311, 65374}, + With: []rune{32, 96, 120, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 45, 96, 47, 33, 35, 40, 41, 44, 58, 59, 63, 126}, + Locale: "tr", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 160, Hi: 180, Stride: 20}, + {Lo: 215, Hi: 921, Stride: 706}, + {Lo: 1009, Hi: 1040, Stride: 31}, + {Lo: 1042, Hi: 1045, Stride: 3}, + {Lo: 1047, Hi: 1050, Stride: 3}, + {Lo: 1052, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 
7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8211, Hi: 8245, Stride: 34}, + {Lo: 12494, Hi: 65281, Stride: 52787}, + {Lo: 65283, Hi: 65288, Stride: 5}, + {Lo: 65289, Hi: 65292, Stride: 3}, + {Lo: 65306, Hi: 65307, Stride: 1}, + {Lo: 65311, Hi: 65374, Stride: 63}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "zh-hans": { + Confusable: []rune{180, 215, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8245, 12494, 65281, 65288, 65289, 65306, 65374}, + With: []rune{96, 120, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 96, 47, 33, 40, 41, 58, 126}, + Locale: "zh-hans", + RangeTable: &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 215, Stride: 35}, + {Lo: 305, Hi: 921, Stride: 616}, + {Lo: 1009, Hi: 1040, Stride: 31}, + {Lo: 1042, Hi: 1045, Stride: 3}, + {Lo: 1047, Hi: 1050, Stride: 3}, + {Lo: 1052, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8245, Hi: 12494, Stride: 4249}, + {Lo: 65281, Hi: 65288, Stride: 7}, + {Lo: 65289, Hi: 65306, Stride: 17}, + {Lo: 65374, Hi: 65374, Stride: 1}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, + "zh-hant": { + Confusable: []rune{180, 215, 305, 921, 1009, 1040, 1042, 1045, 1047, 1050, 1052, 1053, 1054, 1056, 1057, 1058, 1059, 1061, 1068, 1072, 1073, 1075, 1077, 1086, 1088, 1089, 1091, 1093, 8211, 12494, 65283, 65307, 65374}, + With: []rune{96, 120, 105, 73, 112, 65, 66, 69, 51, 75, 77, 72, 79, 80, 67, 84, 89, 88, 98, 97, 54, 114, 101, 111, 112, 99, 121, 120, 45, 47, 35, 59, 126}, + Locale: "zh-hant", + RangeTable: 
&unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 180, Hi: 215, Stride: 35}, + {Lo: 305, Hi: 921, Stride: 616}, + {Lo: 1009, Hi: 1040, Stride: 31}, + {Lo: 1042, Hi: 1045, Stride: 3}, + {Lo: 1047, Hi: 1050, Stride: 3}, + {Lo: 1052, Hi: 1054, Stride: 1}, + {Lo: 1056, Hi: 1059, Stride: 1}, + {Lo: 1061, Hi: 1068, Stride: 7}, + {Lo: 1072, Hi: 1073, Stride: 1}, + {Lo: 1075, Hi: 1077, Stride: 2}, + {Lo: 1086, Hi: 1088, Stride: 2}, + {Lo: 1089, Hi: 1093, Stride: 2}, + {Lo: 8211, Hi: 12494, Stride: 4283}, + {Lo: 65283, Hi: 65307, Stride: 24}, + {Lo: 65374, Hi: 65374, Stride: 1}, + }, + R32: []unicode.Range32{}, + LatinOffset: 1, + }, + }, +} diff --git a/modules/charset/ambiguous_gen_test.go b/modules/charset/ambiguous_gen_test.go new file mode 100644 index 0000000..221c27d --- /dev/null +++ b/modules/charset/ambiguous_gen_test.go @@ -0,0 +1,31 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package charset + +import ( + "sort" + "testing" + "unicode" + + "github.com/stretchr/testify/assert" +) + +func TestAmbiguousCharacters(t *testing.T) { + for locale, ambiguous := range AmbiguousCharacters { + assert.Equal(t, locale, ambiguous.Locale) + assert.Equal(t, len(ambiguous.Confusable), len(ambiguous.With)) + assert.True(t, sort.SliceIsSorted(ambiguous.Confusable, func(i, j int) bool { + return ambiguous.Confusable[i] < ambiguous.Confusable[j] + })) + + for _, confusable := range ambiguous.Confusable { + assert.True(t, unicode.Is(ambiguous.RangeTable, confusable)) + i := sort.Search(len(ambiguous.Confusable), func(j int) bool { + return ambiguous.Confusable[j] >= confusable + }) + found := i < len(ambiguous.Confusable) && ambiguous.Confusable[i] == confusable + assert.True(t, found, "%c is not in %d", confusable, i) + } + } +} diff --git a/modules/charset/breakwriter.go b/modules/charset/breakwriter.go new file mode 100644 index 0000000..a87e846 --- /dev/null +++ b/modules/charset/breakwriter.go @@ -0,0 +1,43 @@ +// 
// htmlBreak is the replacement written for every '\n'. It is built once here
// instead of being re-allocated for each newline inside Write.
var htmlBreak = []byte("<br>")

// BreakWriter wraps an io.Writer to always write '\n' as '<br>'
type BreakWriter struct {
	io.Writer
}

// Write writes the provided byte slice transparently replacing '\n' with '<br>'.
//
// The returned count is expressed in bytes of bs that were consumed, not in
// bytes sent to the wrapped writer: every '\n' counts as one byte even though
// four bytes ("<br>") are written for it. On a full success n == len(bs).
// The first error returned by the wrapped writer stops processing and is
// returned together with the count reached so far.
func (b *BreakWriter) Write(bs []byte) (n int, err error) {
	for len(bs) > 0 {
		idx := bytes.IndexByte(bs, '\n')
		if idx < 0 {
			// No newline left: hand the remainder to the wrapped writer as is.
			wn, werr := b.Writer.Write(bs)
			return n + wn, werr
		}

		if idx > 0 {
			// Text in front of the newline.
			wn, werr := b.Writer.Write(bs[:idx])
			n += wn
			if werr != nil {
				return n, werr
			}
		}

		if _, werr := b.Writer.Write(htmlBreak); werr != nil {
			// The '\n' is not counted because its replacement was not written.
			return n, werr
		}
		n++ // account for the single '\n' byte that was consumed

		bs = bs[idx+1:]
	}

	return n, nil
}
+// SPDX-License-Identifier: MIT + +package charset + +import ( + "strings" + "testing" +) + +func TestBreakWriter_Write(t *testing.T) { + tests := []struct { + name string + kase string + expect string + wantErr bool + }{ + { + name: "noline", + kase: "abcdefghijklmnopqrstuvwxyz", + expect: "abcdefghijklmnopqrstuvwxyz", + }, + { + name: "endline", + kase: "abcdefghijklmnopqrstuvwxyz\n", + expect: "abcdefghijklmnopqrstuvwxyz<br>", + }, + { + name: "startline", + kase: "\nabcdefghijklmnopqrstuvwxyz", + expect: "<br>abcdefghijklmnopqrstuvwxyz", + }, + { + name: "onlyline", + kase: "\n\n\n", + expect: "<br><br><br>", + }, + { + name: "empty", + kase: "", + expect: "", + }, + { + name: "midline", + kase: "\nabc\ndefghijkl\nmnopqrstuvwxy\nz", + expect: "<br>abc<br>defghijkl<br>mnopqrstuvwxy<br>z", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + buf := &strings.Builder{} + b := &BreakWriter{ + Writer: buf, + } + n, err := b.Write([]byte(tt.kase)) + if (err != nil) != tt.wantErr { + t.Errorf("BreakWriter.Write() error = %v, wantErr %v", err, tt.wantErr) + return + } + if n != len(tt.kase) { + t.Errorf("BreakWriter.Write() = %v, want %v", n, len(tt.kase)) + } + if buf.String() != tt.expect { + t.Errorf("BreakWriter.Write() wrote %q, want %v", buf.String(), tt.expect) + } + }) + } +} diff --git a/modules/charset/charset.go b/modules/charset/charset.go new file mode 100644 index 0000000..1855446 --- /dev/null +++ b/modules/charset/charset.go @@ -0,0 +1,211 @@ +// Copyright 2014 The Gogs Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package charset + +import ( + "bytes" + "fmt" + "io" + "strings" + "unicode/utf8" + + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/util" + + "github.com/gogs/chardet" + "golang.org/x/net/html/charset" + "golang.org/x/text/transform" +) + +// UTF8BOM is the utf-8 byte-order marker +var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'} + +type ConvertOpts struct { + KeepBOM bool +} + +// ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible +func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader { + buf := make([]byte, 2048) + n, err := util.ReadAtMost(rd, buf) + if err != nil { + return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd) + } + + charsetLabel, err := DetectEncoding(buf[:n]) + if err != nil || charsetLabel == "UTF-8" { + return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd) + } + + encoding, _ := charset.Lookup(charsetLabel) + if encoding == nil { + return io.MultiReader(bytes.NewReader(buf[:n]), rd) + } + + return transform.NewReader( + io.MultiReader( + bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), + rd, + ), + encoding.NewDecoder(), + ) +} + +// ToUTF8 converts content to UTF8 encoding +func ToUTF8(content []byte, opts ConvertOpts) (string, error) { + charsetLabel, err := DetectEncoding(content) + if err != nil { + return "", err + } else if charsetLabel == "UTF-8" { + return string(MaybeRemoveBOM(content, opts)), nil + } + + encoding, _ := charset.Lookup(charsetLabel) + if encoding == nil { + return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel) + } + + // If there is an error, we concatenate the nicely decoded part and the + // original left over. This way we won't lose much data. + result, n, err := transform.Bytes(encoding.NewDecoder(), content) + if err != nil { + result = append(result, content[n:]...) 
+ } + + result = MaybeRemoveBOM(result, opts) + + return string(result), err +} + +// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible +func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte { + bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content), opts)) + return bs +} + +// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible +func ToUTF8DropErrors(content []byte, opts ConvertOpts) []byte { + charsetLabel, err := DetectEncoding(content) + if err != nil || charsetLabel == "UTF-8" { + return MaybeRemoveBOM(content, opts) + } + + encoding, _ := charset.Lookup(charsetLabel) + if encoding == nil { + return content + } + + // We ignore any non-decodable parts from the file. + // Some parts might be lost + var decoded []byte + decoder := encoding.NewDecoder() + idx := 0 + for { + result, n, err := transform.Bytes(decoder, content[idx:]) + decoded = append(decoded, result...) + if err == nil { + break + } + decoded = append(decoded, ' ') + idx = idx + n + 1 + if idx >= len(content) { + break + } + } + + return MaybeRemoveBOM(decoded, opts) +} + +// MaybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false +func MaybeRemoveBOM(content []byte, opts ConvertOpts) []byte { + if opts.KeepBOM { + return content + } + if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { + return content[3:] + } + return content +} + +// DetectEncoding detect the encoding of content +func DetectEncoding(content []byte) (string, error) { + // First we check if the content represents valid utf8 content excepting a truncated character at the end. 
+ + // Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do + // instead we walk backwards from the end to trim off a the incomplete character + toValidate := content + end := len(toValidate) - 1 + + if end < 0 { + // no-op + } else if toValidate[end]>>5 == 0b110 { + // Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2> + toValidate = toValidate[:end] + } else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 { + // Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b> + toValidate = toValidate[:end-1] + } else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 { + // Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92> + toValidate = toValidate[:end-2] + } + if utf8.Valid(toValidate) { + log.Debug("Detected encoding: utf-8 (fast)") + return "UTF-8", nil + } + + textDetector := chardet.NewTextDetector() + var detectContent []byte + if len(content) < 1024 { + // Check if original content is valid + if _, err := textDetector.DetectBest(content); err != nil { + return "", err + } + times := 1024 / len(content) + detectContent = make([]byte, 0, times*len(content)) + for i := 0; i < times; i++ { + detectContent = append(detectContent, content...) 
+ } + } else { + detectContent = content + } + + // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break + results, err := textDetector.DetectAll(detectContent) + if err != nil { + if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { + log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) + return setting.Repository.AnsiCharset, nil + } + return "", err + } + + topConfidence := results[0].Confidence + topResult := results[0] + priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))] + for _, result := range results { + // As results are sorted in confidence order - if we have a different confidence + // we know it's less than the current confidence and can break out of the loop early + if result.Confidence != topConfidence { + break + } + + // Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guess + resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))] + if resultHas && (!has || resultPriority < priority) { + topResult = result + priority = resultPriority + has = true + } + } + + // FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument + if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { + log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) + return setting.Repository.AnsiCharset, err + } + + log.Debug("Detected encoding: %s", topResult.Charset) + return topResult.Charset, err +} diff --git a/modules/charset/charset_test.go b/modules/charset/charset_test.go new file mode 100644 index 0000000..42c8415 --- /dev/null +++ b/modules/charset/charset_test.go @@ -0,0 +1,385 @@ +// Copyright 2019 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package charset + +import ( + "bytes" + "io" + "strings" + "testing" + + "code.gitea.io/gitea/modules/setting" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func resetDefaultCharsetsOrder() { + defaultDetectedCharsetsOrder := make([]string, 0, len(setting.Repository.DetectedCharsetsOrder)) + for _, charset := range setting.Repository.DetectedCharsetsOrder { + defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset))) + } + setting.Repository.DetectedCharsetScore = map[string]int{} + i := 0 + for _, charset := range defaultDetectedCharsetsOrder { + canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) + if _, has := setting.Repository.DetectedCharsetScore[canonicalCharset]; !has { + setting.Repository.DetectedCharsetScore[canonicalCharset] = i + i++ + } + } +} + +func TestMaybeRemoveBOM(t *testing.T) { + res := MaybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) + + res = MaybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) +} + +func TestToUTF8(t *testing.T) { + resetDefaultCharsetsOrder() + var res string + var err error + + // Note: golang compiler seems so behave differently depending on the current + // locale, so some conversions might behave differently. For that reason, we don't + // depend on particular conversions but in expected behaviors. 
+ + res, err = ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) + require.NoError(t, err) + assert.Equal(t, "ABC", res) + + // "áéíóú" + res, err = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + require.NoError(t, err) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res)) + + // "áéíóú" + res, err = ToUTF8([]byte{ + 0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, + 0xc3, 0xba, + }, ConvertOpts{}) + require.NoError(t, err) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res)) + + res, err = ToUTF8([]byte{ + 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, + 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e, + }, ConvertOpts{}) + require.NoError(t, err) + stringMustStartWith(t, "Hola,", res) + stringMustEndWith(t, "AAA.", res) + + res, err = ToUTF8([]byte{ + 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, + 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e, + }, ConvertOpts{}) + require.NoError(t, err) + stringMustStartWith(t, "Hola,", res) + stringMustEndWith(t, "AAA.", res) + + res, err = ToUTF8([]byte{ + 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, + 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e, + }, ConvertOpts{}) + require.NoError(t, err) + stringMustStartWith(t, "Hola,", res) + stringMustEndWith(t, "AAA.", res) + + // Japanese (Shift-JIS) + // 日属秘ぞしちゅ。 + res, err = ToUTF8([]byte{ + 0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, + 0xBF, 0x82, 0xE3, 0x81, 0x42, + }, ConvertOpts{}) + require.NoError(t, err) + assert.Equal(t, []byte{ + 0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3, + 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82, + }, + []byte(res)) + + res, err = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) + 
require.NoError(t, err) + assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res)) +} + +func TestToUTF8WithFallback(t *testing.T) { + resetDefaultCharsetsOrder() + // "ABC" + res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) + assert.Equal(t, []byte{0x41, 0x42, 0x43}, res) + + // "áéíóú" + res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) + + // UTF8 BOM + "áéíóú" + res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) + + // "Hola, así cómo ños" + res = ToUTF8WithFallback([]byte{ + 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, + 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, + }, ConvertOpts{}) + assert.Equal(t, []byte{ + 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, + 0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73, + }, res) + + // "Hola, así cómo " + minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20} + + res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{}) + // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those + assert.Equal(t, minmatch, res[0:len(minmatch)]) + + res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{}) + // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those + assert.Equal(t, minmatch, res[0:len(minmatch)]) + + // Japanese (Shift-JIS) + // "日属秘ぞしちゅ。" + res = 
ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{}) + assert.Equal(t, []byte{ + 0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3, + 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82, + }, res) + + res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) + assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res) +} + +func TestToUTF8DropErrors(t *testing.T) { + resetDefaultCharsetsOrder() + // "ABC" + res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) + assert.Equal(t, []byte{0x41, 0x42, 0x43}, res) + + // "áéíóú" + res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) + + // UTF8 BOM + "áéíóú" + res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) + + // "Hola, así cómo ños" + res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}, ConvertOpts{}) + assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73}, res[:8]) + assert.Equal(t, []byte{0x73}, res[len(res)-1:]) + + // "Hola, así cómo " + minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20} + + res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{}) + // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those + assert.Equal(t, minmatch, res[0:len(minmatch)]) + + res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 
0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{}) + // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those + assert.Equal(t, minmatch, res[0:len(minmatch)]) + + // Japanese (Shift-JIS) + // "日属秘ぞしちゅ。" + res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{}) + assert.Equal(t, []byte{ + 0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3, + 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82, + }, res) + + res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) + assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res) +} + +func TestDetectEncoding(t *testing.T) { + resetDefaultCharsetsOrder() + testSuccess := func(b []byte, expected string) { + encoding, err := DetectEncoding(b) + require.NoError(t, err) + assert.Equal(t, expected, encoding) + } + // utf-8 + b := []byte("just some ascii") + testSuccess(b, "UTF-8") + + // utf-8-sig: "hey" (with BOM) + b = []byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79} + testSuccess(b, "UTF-8") + + // utf-16: "hey<accented G>" + b = []byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01} + testSuccess(b, "UTF-16LE") + + // iso-8859-1: d<accented e>cor<newline> + b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a} + encoding, err := DetectEncoding(b) + require.NoError(t, err) + assert.Contains(t, encoding, "ISO-8859-1") + + old := setting.Repository.AnsiCharset + setting.Repository.AnsiCharset = "placeholder" + defer func() { + setting.Repository.AnsiCharset = old + }() + testSuccess(b, "placeholder") + + // invalid bytes + b = []byte{0xfa} + _, err = DetectEncoding(b) + require.Error(t, err) +} + +func stringMustStartWith(t *testing.T, expected, value string) { + assert.Equal(t, expected, value[:len(expected)]) +} + +func stringMustEndWith(t *testing.T, expected, value string) { + assert.Equal(t, expected, 
value[len(value)-len(expected):]) +} + +func TestToUTF8WithFallbackReader(t *testing.T) { + resetDefaultCharsetsOrder() + + for testLen := 0; testLen < 2048; testLen++ { + pattern := " test { () }\n" + input := "" + for len(input) < testLen { + input += pattern + } + input = input[:testLen] + input += "// Выключаем" + rd := ToUTF8WithFallbackReader(bytes.NewReader([]byte(input)), ConvertOpts{}) + r, _ := io.ReadAll(rd) + assert.EqualValuesf(t, input, string(r), "testing string len=%d", testLen) + } + + truncatedOneByteExtension := failFastBytes + encoding, _ := DetectEncoding(truncatedOneByteExtension) + assert.Equal(t, "UTF-8", encoding) + + truncatedTwoByteExtension := failFastBytes + truncatedTwoByteExtension[len(failFastBytes)-1] = 0x9b + truncatedTwoByteExtension[len(failFastBytes)-2] = 0xe2 + + encoding, _ = DetectEncoding(truncatedTwoByteExtension) + assert.Equal(t, "UTF-8", encoding) + + truncatedThreeByteExtension := failFastBytes + truncatedThreeByteExtension[len(failFastBytes)-1] = 0x92 + truncatedThreeByteExtension[len(failFastBytes)-2] = 0x9f + truncatedThreeByteExtension[len(failFastBytes)-3] = 0xf0 + + encoding, _ = DetectEncoding(truncatedThreeByteExtension) + assert.Equal(t, "UTF-8", encoding) +} + +var failFastBytes = []byte{ + 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x74, 0x6f, + 0x6f, 0x6c, 0x73, 0x2e, 0x61, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x64, 0x65, 0x66, 0x73, 0x2e, 0x63, 0x6f, 0x6e, + 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x4f, 0x73, 0x0a, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67, + 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, + 0x74, 0x2e, 0x67, 0x72, 0x61, 0x64, 0x6c, 0x65, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x75, 0x6e, 0x2e, 0x42, + 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x0a, 0x0a, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 
0x20, 0x20, 0x69, 0x64, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, + 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x64, 0x65, 0x70, 0x65, + 0x6e, 0x64, 0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, + 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, + 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, + 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, + 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x2d, 0x64, 0x6f, 0x63, 0x73, 0x22, 0x29, + 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x64, 0x62, + 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, + 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, + 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, + 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x66, + 0x73, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, + 0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 
0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x6d, 0x71, 0x22, 0x29, 0x29, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, + 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, + 0x2d, 0x61, 0x75, 0x74, 0x68, 0x2d, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74, + 0x65, 0x72, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, + 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x68, 0x61, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, + 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, + 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x22, 0x29, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, + 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74, + 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x77, 0x65, 0x62, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, + 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, + 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, + 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x6f, 0x70, + 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 
0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, + 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, + 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x63, 0x74, 0x75, 0x61, 0x74, 0x6f, 0x72, 0x22, 0x29, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, + 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, + 0x6c, 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, + 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x73, 0x74, 0x72, 0x61, 0x70, 0x22, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, + 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c, + 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61, + 0x72, 0x74, 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x73, 0x75, 0x6c, 0x2d, 0x61, 0x6c, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, + 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c, + 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61, + 0x72, 0x74, 0x65, 0x72, 0x2d, 0x73, 0x6c, 0x65, 0x75, 0x74, 0x68, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, + 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, + 0x72, 0x69, 0x6e, 
0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x72, 0x65, 0x74, 0x72, 0x79, 0x3a, + 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x72, 0x65, 0x74, 0x72, 0x79, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x63, 0x68, 0x2e, 0x71, + 0x6f, 0x73, 0x2e, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x3a, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x2d, 0x63, + 0x6c, 0x61, 0x73, 0x73, 0x69, 0x63, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, + 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x69, 0x6f, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65, + 0x74, 0x65, 0x72, 0x3a, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x2d, 0x72, 0x65, 0x67, 0x69, 0x73, + 0x74, 0x72, 0x79, 0x2d, 0x70, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x65, 0x75, 0x73, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x6b, 0x6f, 0x74, + 0x6c, 0x69, 0x6e, 0x28, 0x22, 0x73, 0x74, 0x64, 0x6c, 0x69, 0x62, 0x22, 0x29, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, + 0x2f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x65, 0x73, 0x74, 0x20, 0x64, 0x65, 0x70, 0x65, 0x6e, 0x64, + 0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, + 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x65, 0x73, 0x74, 0x49, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, + 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, + 0x74, 0x65, 0x73, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 
0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a, + 0x61, 0x72, 0x20, 0x62, 0x79, 0x20, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, + 0x69, 0x6e, 0x67, 0x28, 0x4a, 0x61, 0x72, 0x3a, 0x3a, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x61, 0x72, 0x63, 0x68, 0x69, 0x76, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x69, 0x66, 0x69, 0x65, 0x72, 0x2e, + 0x73, 0x65, 0x74, 0x28, 0x22, 0x70, 0x61, 0x74, 0x63, 0x68, 0x65, 0x64, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x76, 0x61, 0x6c, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68, + 0x20, 0x62, 0x79, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2e, 0x67, + 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x6e, 0x69, 0x66, 0x65, 0x73, 0x74, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, + 0x73, 0x28, 0x22, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x2d, 0x50, 0x61, 0x74, 0x68, 0x22, 0x20, 0x74, 0x6f, 0x20, 0x6f, 0x62, + 0x6a, 0x65, 0x63, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, + 0x72, 0x69, 0x76, 0x61, 0x74, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x3d, + 0x20, 0x22, 0x66, 0x69, 0x6c, 0x65, 0x3a, 0x2f, 0x2b, 0x22, 0x2e, 0x74, 0x6f, 0x52, 0x65, 0x67, 0x65, 0x78, 0x28, 0x29, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64, + 0x65, 0x20, 0x66, 0x75, 0x6e, 0x20, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x3a, 0x20, 0x53, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70, + 0x61, 0x74, 0x68, 0x2e, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x54, 0x6f, 0x53, 0x74, 0x72, 
0x69, + 0x6e, 0x67, 0x28, 0x22, 0x20, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x74, 0x2e, 0x74, 0x6f, 0x55, 0x52, 0x49, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x55, + 0x52, 0x4c, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, + 0x61, 0x63, 0x65, 0x46, 0x69, 0x72, 0x73, 0x74, 0x28, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2c, 0x20, 0x22, 0x2f, + 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x74, 0x61, 0x73, + 0x6b, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x3c, 0x42, 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x3e, 0x28, 0x22, 0x62, + 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x4f, + 0x73, 0x2e, 0x69, 0x73, 0x46, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x28, 0x4f, 0x73, 0x2e, 0x46, 0x41, 0x4d, 0x49, 0x4c, 0x59, + 0x5f, 0x57, 0x49, 0x4e, 0x44, 0x4f, 0x57, 0x53, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68, 0x20, 0x3d, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x28, 0x73, + 0x6f, 0x75, 0x72, 0x63, 0x65, 0x53, 0x65, 0x74, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x28, 0x22, 0x6d, 0x61, 0x69, + 0x6e, 0x22, 0x29, 0x2e, 0x6d, 0x61, 0x70, 0x20, 0x7b, 0x20, 0x69, 0x74, 0x2e, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, + 0x7d, 0x2c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a, 0x61, 0x72, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0xd0, +} diff --git a/modules/charset/escape.go b/modules/charset/escape.go new file mode 100644 index 0000000..ba0eb73 --- /dev/null +++ b/modules/charset/escape.go @@ -0,0 +1,58 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +//go:generate go run invisible/generate.go -v -o ./invisible_gen.go + +//go:generate go run ambiguous/generate.go -v -o ./ambiguous_gen.go ambiguous/ambiguous.json + +package charset + +import ( + "html/template" + "io" + "slices" + "strings" + + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/translation" +) + +// RuneNBSP is the codepoint for NBSP +const RuneNBSP = 0xa0 + +type escapeContext string + +// Keep this consistent with the documentation of [ui].SKIP_ESCAPE_CONTEXTS +// Defines the different contexts that could be used to escape in. +const ( + // Wiki pages. + WikiContext escapeContext = "wiki" + // Rendered content (except markup), source code and blames. + FileviewContext escapeContext = "file-view" + // Commits or pull requet's diff. + DiffContext escapeContext = "diff" +) + +// EscapeControlHTML escapes the unicode control sequences in a provided html document +func EscapeControlHTML(html template.HTML, locale translation.Locale, context escapeContext, allowed ...rune) (escaped *EscapeStatus, output template.HTML) { + sb := &strings.Builder{} + escaped, _ = EscapeControlReader(strings.NewReader(string(html)), sb, locale, context, allowed...) 
// err has been handled in EscapeControlReader + return escaped, template.HTML(sb.String()) +} + +// EscapeControlReader escapes the unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus +func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, context escapeContext, allowed ...rune) (escaped *EscapeStatus, err error) { + if !setting.UI.AmbiguousUnicodeDetection || slices.Contains(setting.UI.SkipEscapeContexts, string(context)) { + _, err = io.Copy(writer, reader) + return &EscapeStatus{}, err + } + outputStream := &HTMLStreamerWriter{Writer: writer} + streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) + + if err = StreamHTML(reader, streamer); err != nil { + streamer.escaped.HasError = true + log.Error("Error whilst escaping: %v", err) + } + return streamer.escaped, err +} diff --git a/modules/charset/escape_status.go b/modules/charset/escape_status.go new file mode 100644 index 0000000..37b6ad8 --- /dev/null +++ b/modules/charset/escape_status.go @@ -0,0 +1,27 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package charset + +// EscapeStatus represents the findings of the unicode escaper +type EscapeStatus struct { + Escaped bool + HasError bool + HasBadRunes bool + HasInvisible bool + HasAmbiguous bool +} + +// Or combines two EscapeStatus structs into one representing the conjunction of the two +func (status *EscapeStatus) Or(other *EscapeStatus) *EscapeStatus { + st := status + if status == nil { + st = &EscapeStatus{} + } + st.Escaped = st.Escaped || other.Escaped + st.HasError = st.HasError || other.HasError + st.HasBadRunes = st.HasBadRunes || other.HasBadRunes + st.HasAmbiguous = st.HasAmbiguous || other.HasAmbiguous + st.HasInvisible = st.HasInvisible || other.HasInvisible + return st +} diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go new file mode 100644 index 0000000..29943eb --- /dev/null +++ b/modules/charset/escape_stream.go @@ -0,0 +1,289 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package charset + +import ( + "fmt" + "regexp" + "strings" + "unicode" + "unicode/utf8" + + "code.gitea.io/gitea/modules/translation" + + "golang.org/x/net/html" +) + +// VScode defaultWordRegexp +var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`) + +func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer { + allowedM := make(map[rune]bool, len(allowed)) + for _, v := range allowed { + allowedM[v] = true + } + return &escapeStreamer{ + escaped: &EscapeStatus{}, + PassthroughHTMLStreamer: *NewPassthroughStreamer(next), + locale: locale, + ambiguousTables: AmbiguousTablesForLocale(locale), + allowed: allowedM, + } +} + +type escapeStreamer struct { + PassthroughHTMLStreamer + escaped *EscapeStatus + locale translation.Locale + ambiguousTables []*AmbiguousTable + allowed map[rune]bool +} + +func (e *escapeStreamer) 
EscapeStatus() *EscapeStatus { + return e.escaped +} + +// Text tells the next streamer there is a text +func (e *escapeStreamer) Text(data string) error { + sb := &strings.Builder{} + var until int + var next int + pos := 0 + if len(data) > len(UTF8BOM) && data[:len(UTF8BOM)] == string(UTF8BOM) { + _, _ = sb.WriteString(data[:len(UTF8BOM)]) + pos = len(UTF8BOM) + } + dataBytes := []byte(data) + for pos < len(data) { + nextIdxs := defaultWordRegexp.FindStringIndex(data[pos:]) + if nextIdxs == nil { + until = len(data) + next = until + } else { + until, next = nextIdxs[0]+pos, nextIdxs[1]+pos + } + + // from pos until we know that the runes are not \r\t\n or even ' ' + runes := make([]rune, 0, next-until) + positions := make([]int, 0, next-until+1) + + for pos < until { + r, sz := utf8.DecodeRune(dataBytes[pos:]) + positions = positions[:0] + positions = append(positions, pos, pos+sz) + types, confusables, _ := e.runeTypes(r) + if err := e.handleRunes(dataBytes, []rune{r}, positions, types, confusables, sb); err != nil { + return err + } + pos += sz + } + + for i := pos; i < next; { + r, sz := utf8.DecodeRune(dataBytes[i:]) + runes = append(runes, r) + positions = append(positions, i) + i += sz + } + positions = append(positions, next) + types, confusables, runeCounts := e.runeTypes(runes...) 
+ if runeCounts.needsEscape() { + if err := e.handleRunes(dataBytes, runes, positions, types, confusables, sb); err != nil { + return err + } + } else { + _, _ = sb.Write(dataBytes[pos:next]) + } + pos = next + } + if sb.Len() > 0 { + if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { + return err + } + } + return nil +} + +func (e *escapeStreamer) handleRunes(data []byte, runes []rune, positions []int, types []runeType, confusables []rune, sb *strings.Builder) error { + for i, r := range runes { + switch types[i] { + case brokenRuneType: + if sb.Len() > 0 { + if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { + return err + } + sb.Reset() + } + end := positions[i+1] + start := positions[i] + if err := e.brokenRune(data[start:end]); err != nil { + return err + } + case ambiguousRuneType: + if sb.Len() > 0 { + if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { + return err + } + sb.Reset() + } + if err := e.ambiguousRune(r, confusables[0]); err != nil { + return err + } + confusables = confusables[1:] + case invisibleRuneType: + if sb.Len() > 0 { + if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil { + return err + } + sb.Reset() + } + if err := e.invisibleRune(r); err != nil { + return err + } + default: + _, _ = sb.WriteRune(r) + } + } + return nil +} + +func (e *escapeStreamer) brokenRune(bs []byte) error { + e.escaped.Escaped = true + e.escaped.HasBadRunes = true + + if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ + Key: "class", + Val: "broken-code-point", + }); err != nil { + return err + } + if err := e.PassthroughHTMLStreamer.Text(fmt.Sprintf("<%X>", bs)); err != nil { + return err + } + + return e.PassthroughHTMLStreamer.EndTag("span") +} + +func (e *escapeStreamer) ambiguousRune(r, c rune) error { + e.escaped.Escaped = true + e.escaped.HasAmbiguous = true + + if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ + Key: "class", + Val: 
"ambiguous-code-point", + }, html.Attribute{ + Key: "data-tooltip-content", + Val: e.locale.TrString("repo.ambiguous_character", r, c), + }); err != nil { + return err + } + if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ + Key: "class", + Val: "char", + }); err != nil { + return err + } + if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil { + return err + } + if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil { + return err + } + + return e.PassthroughHTMLStreamer.EndTag("span") +} + +func (e *escapeStreamer) invisibleRune(r rune) error { + e.escaped.Escaped = true + e.escaped.HasInvisible = true + + if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ + Key: "class", + Val: "escaped-code-point", + }, html.Attribute{ + Key: "data-escaped", + Val: fmt.Sprintf("[U+%04X]", r), + }); err != nil { + return err + } + if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{ + Key: "class", + Val: "char", + }); err != nil { + return err + } + if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil { + return err + } + if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil { + return err + } + + return e.PassthroughHTMLStreamer.EndTag("span") +} + +type runeCountType struct { + numBasicRunes int + numNonConfusingNonBasicRunes int + numAmbiguousRunes int + numInvisibleRunes int + numBrokenRunes int +} + +func (counts runeCountType) needsEscape() bool { + if counts.numBrokenRunes > 0 { + return true + } + if counts.numBasicRunes == 0 && + counts.numNonConfusingNonBasicRunes > 0 { + return false + } + return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0 +} + +type runeType int + +const ( + basicASCIIRuneType runeType = iota // <- This is technically deadcode but its self-documenting so it should stay + brokenRuneType + nonBasicASCIIRuneType + ambiguousRuneType + invisibleRuneType +) + +func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables []rune, 
runeCounts runeCountType) { + types = make([]runeType, len(runes)) + for i, r := range runes { + var confusable rune + switch { + case r == utf8.RuneError: + types[i] = brokenRuneType + runeCounts.numBrokenRunes++ + case r == ' ' || r == '\t' || r == '\n': + runeCounts.numBasicRunes++ + case e.allowed[r]: + if r > 0x7e || r < 0x20 { + types[i] = nonBasicASCIIRuneType + runeCounts.numNonConfusingNonBasicRunes++ + } else { + runeCounts.numBasicRunes++ + } + case unicode.Is(InvisibleRanges, r): + types[i] = invisibleRuneType + runeCounts.numInvisibleRunes++ + case unicode.IsControl(r): + types[i] = invisibleRuneType + runeCounts.numInvisibleRunes++ + case isAmbiguous(r, &confusable, e.ambiguousTables...): + confusables = append(confusables, confusable) + types[i] = ambiguousRuneType + runeCounts.numAmbiguousRunes++ + case r > 0x7e || r < 0x20: + types[i] = nonBasicASCIIRuneType + runeCounts.numNonConfusingNonBasicRunes++ + default: + runeCounts.numBasicRunes++ + } + } + return types, confusables, runeCounts +} diff --git a/modules/charset/escape_test.go b/modules/charset/escape_test.go new file mode 100644 index 0000000..2ca76f8 --- /dev/null +++ b/modules/charset/escape_test.go @@ -0,0 +1,194 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package charset + +import ( + "html/template" + "strings" + "testing" + + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" + "code.gitea.io/gitea/modules/translation" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var testContext = escapeContext("test") + +type escapeControlTest struct { + name string + text string + status EscapeStatus + result string +} + +var escapeControlTests = []escapeControlTest{ + { + name: "<empty>", + }, + { + name: "single line western", + text: "single line western", + result: "single line western", + status: EscapeStatus{}, + }, + { + name: "multi line western", + text: "single line western\nmulti line western\n", + result: "single line western\nmulti line western\n", + status: EscapeStatus{}, + }, + { + name: "multi line western non-breaking space", + text: "single line western\nmulti line western\n", + result: `single line<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>western` + "\n" + `multi line<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>western` + "\n", + status: EscapeStatus{Escaped: true, HasInvisible: true}, + }, + { + name: "mixed scripts: western + japanese", + text: "日属秘ぞしちゅ。Then some western.", + result: "日属秘ぞしちゅ。Then some western.", + status: EscapeStatus{}, + }, + { + name: "japanese", + text: "日属秘ぞしちゅ。", + result: "日属秘ぞしちゅ。", + status: EscapeStatus{}, + }, + { + name: "hebrew", + text: "עד תקופת יוון העתיקה היה העיסוק במתמטיקה תכליתי בלבד: היא שימשה כאוסף של נוסחאות לחישוב קרקע, אוכלוסין וכו'. פריצת הדרך של היוונים, פרט לתרומותיהם הגדולות לידע המתמטי, הייתה בלימוד המתמטיקה כשלעצמה, מתוקף ערכה הרוחני. יחסם של חלק מהיוונים הקדמונים למתמטיקה היה דתי - למשל, הכת שאסף סביבו פיתגורס האמינה כי המתמטיקה היא הבסיס לכל הדברים. 
היוונים נחשבים ליוצרי מושג ההוכחה המתמטית, וכן לראשונים שעסקו במתמטיקה לשם עצמה, כלומר כתחום מחקרי עיוני ומופשט ולא רק כעזר שימושי. עם זאת, לצדה", + result: `עד תקופת <span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">י</span></span><span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ו</span></span><span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ו</span></span><span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ן</span></span> העתיקה היה העיסוק במתמטיקה תכליתי בלבד: היא שימשה כאוסף של נוסחאות לחישוב קרקע, אוכלוסין וכו'. פריצת הדרך של היוונים, פרט לתרומותיהם הגדולות לידע המתמטי, הייתה בלימוד המתמטיקה כשלעצמה, מתוקף ערכה הרוחני. יחסם של חלק מהיוונים הקדמונים למתמטיקה היה דתי - למשל, הכת שאסף סביבו פיתגורס האמינה כי המתמטיקה היא הבסיס לכל הדברים. היוונים נחשבים ליוצרי מושג ההוכחה המתמטית, וכן לראשונים שעסקו במתמטיקה לשם עצמה, כלומר כתחום מחקרי עיוני ומופשט ולא רק כעזר שימושי. עם זאת, לצדה`, + status: EscapeStatus{Escaped: true, HasAmbiguous: true}, + }, + { + name: "more hebrew", + text: `בתקופה מאוחרת יותר, השתמשו היוונים בשיטת סימון מתקדמת יותר, שבה הוצגו המספרים לפי 22 אותיות האלפבית היווני. לסימון המספרים בין 1 ל-9 נקבעו תשע האותיות הראשונות, בתוספת גרש ( ' ) בצד ימין של האות, למעלה; תשע האותיות הבאות ייצגו את העשרות מ-10 עד 90, והבאות את המאות. לסימון הספרות בין 1000 ל-900,000, השתמשו היוונים באותן אותיות, אך הוסיפו לאותיות את הגרש דווקא מצד שמאל של האותיות, למטה. ממיליון ומעלה, כנראה השתמשו היוונים בשני תגים במקום אחד. + + המתמטיקאי הבולט הראשון ביוון העתיקה, ויש האומרים בתולדות האנושות, הוא תאלס (624 לפנה"ס - 546 לפנה"ס בקירוב).[1] לא יהיה זה משולל יסוד להניח שהוא האדם הראשון שהוכיח משפט מתמטי, ולא רק גילה אותו. 
תאלס הוכיח שישרים מקבילים חותכים מצד אחד של שוקי זווית קטעים בעלי יחסים שווים (משפט תאלס הראשון), שהזווית המונחת על קוטר במעגל היא זווית ישרה (משפט תאלס השני), שהקוטר מחלק את המעגל לשני חלקים שווים, ושזוויות הבסיס במשולש שווה-שוקיים שוות זו לזו. מיוחסות לו גם שיטות למדידת גובהן של הפירמידות בעזרת מדידת צילן ולקביעת מיקומה של ספינה הנראית מן החוף. + + בשנים 582 לפנה"ס עד 496 לפנה"ס, בקירוב, חי מתמטיקאי חשוב במיוחד - פיתגורס. המקורות הראשוניים עליו מועטים, וההיסטוריונים מתקשים להפריד את העובדות משכבת המסתורין והאגדות שנקשרו בו. ידוע שסביבו התקבצה האסכולה הפיתגוראית מעין כת פסבדו-מתמטית שהאמינה ש"הכל מספר", או ליתר דיוק הכל ניתן לכימות, וייחסה למספרים משמעויות מיסטיות. ככל הנראה הפיתגוראים ידעו לבנות את הגופים האפלטוניים, הכירו את הממוצע האריתמטי, הממוצע הגאומטרי והממוצע ההרמוני והגיעו להישגים חשובים נוספים. ניתן לומר שהפיתגוראים גילו את היותו של השורש הריבועי של 2, שהוא גם האלכסון בריבוע שאורך צלעותיו 1, אי רציונלי, אך תגליתם הייתה למעשה רק שהקטעים "חסרי מידה משותפת", ומושג המספר האי רציונלי מאוחר יותר.[2] אזכור ראשון לקיומם של קטעים חסרי מידה משותפת מופיע בדיאלוג "תאיטיטוס" של אפלטון, אך רעיון זה היה מוכר עוד קודם לכן, במאה החמישית לפנה"ס להיפאסוס, בן האסכולה הפיתגוראית, ואולי לפיתגורס עצמו.[3]`, + result: `בתקופה מאוחרת יותר, השתמשו היוונים בשיטת סימון מתקדמת יותר, שבה הוצגו המספרים לפי 22 אותיות האלפבית היווני. לסימון המספרים בין 1 ל-9 נקבעו תשע האותיות הראשונות, בתוספת גרש ( ' ) בצד ימין של האות, למעלה; תשע האותיות הבאות ייצגו את העשרות מ-10 עד 90, והבאות את המאות. לסימון הספרות בין 1000 ל-900,000, השתמשו היוונים באותן אותיות, אך הוסיפו לאותיות את הגרש דווקא מצד שמאל של האותיות, למטה. ממיליון ומעלה, כנראה השתמשו היוונים בשני תגים במקום אחד. 
+ + המתמטיקאי הבולט הראשון ביוון העתיקה, ויש האומרים בתולדות האנושות, הוא תאלס (624 לפנה"<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ס</span></span> - 546 לפנה"<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ס</span></span> בקירוב).[1] לא יהיה זה משולל יסוד להניח שהוא האדם הראשון שהוכיח משפט מתמטי, ולא רק גילה אותו. תאלס הוכיח שישרים מקבילים חותכים מצד אחד של שוקי זווית קטעים בעלי יחסים שווים (משפט תאלס הראשון), שהזווית המונחת על קוטר במעגל היא זווית ישרה (משפט תאלס השני), שהקוטר מחלק את המעגל לשני חלקים שווים, ושזוויות הבסיס במשולש שווה-שוקיים שוות זו לזו. מיוחסות לו גם שיטות למדידת גובהן של הפירמידות בעזרת מדידת צילן ולקביעת מיקומה של ספינה הנראית מן החוף. + + בשנים 582 לפנה"<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ס</span></span> עד 496 לפנה"<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ס</span></span>, בקירוב, חי מתמטיקאי חשוב במיוחד - פיתגורס. המקורות הראשוניים עליו מועטים, וההיסטוריונים מתקשים להפריד את העובדות משכבת המסתורין והאגדות שנקשרו בו. ידוע שסביבו התקבצה האסכולה הפיתגוראית מעין כת פסבדו-מתמטית שהאמינה ש"הכל מספר", או ליתר דיוק הכל ניתן לכימות, וייחסה למספרים משמעויות מיסטיות. ככל הנראה הפיתגוראים ידעו לבנות את הגופים האפלטוניים, הכירו את הממוצע האריתמטי, הממוצע הגאומטרי והממוצע ההרמוני והגיעו להישגים חשובים נוספים. 
ניתן לומר שהפיתגוראים גילו את היותו של השורש הריבועי של 2, שהוא גם האלכסון בריבוע שאורך צלעותיו 1, אי רציונלי, אך תגליתם הייתה למעשה רק שהקטעים "חסרי מידה משותפת", ומושג המספר האי רציונלי מאוחר יותר.[2] אזכור ראשון לקיומם של קטעים חסרי מידה משותפת מופיע בדיאלוג "תאיטיטוס" של אפלטון, אך רעיון זה היה מוכר עוד קודם לכן, במאה החמישית לפנה"<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ס</span></span> להיפאסוס, בן האסכולה הפיתגוראית, ואולי לפיתגורס עצמו.[3]`, + status: EscapeStatus{Escaped: true, HasAmbiguous: true}, + }, + { + name: "Mixed RTL+LTR", + text: `Many computer programs fail to display bidirectional text correctly. +For example, the Hebrew name Sarah (שרה) is spelled: sin (ש) (which appears rightmost), +then resh (ר), and finally heh (ה) (which should appear leftmost).`, + result: `Many computer programs fail to display bidirectional text correctly. +For example, the Hebrew name Sarah (שרה) is spelled: sin (ש) (which appears rightmost), +then resh (ר), and finally heh (ה) (which should appear leftmost).`, + status: EscapeStatus{}, + }, + { + name: "Mixed RTL+LTR+BIDI", + text: `Many computer programs fail to display bidirectional text correctly. + For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" + + `sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).`, + result: `Many computer programs fail to display bidirectional text correctly. 
+ For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" + + `sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).`, + status: EscapeStatus{}, + }, + { + name: "Accented characters", + text: string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}), + result: string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}), + status: EscapeStatus{}, + }, + { + name: "Program", + text: "string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})", + result: "string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})", + status: EscapeStatus{}, + }, + { + name: "CVE testcase", + text: "if access_level != \"user\u202E \u2066// Check if admin\u2069 \u2066\" {", + result: `if access_level != "user<span class="escaped-code-point" data-escaped="[U+202E]"><span class="char">` + "\u202e" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>// Check if admin<span class="escaped-code-point" data-escaped="[U+2069]"><span class="char">` + "\u2069" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>" {`, + status: EscapeStatus{Escaped: true, HasInvisible: true}, + }, + { + name: "Mixed testcase with fail", + text: `Many computer programs fail to display bidirectional text correctly. + For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" + + `sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).` + + "\nif access_level != \"user\u202E \u2066// Check if admin\u2069 \u2066\" {\n", + result: `Many computer programs fail to display bidirectional text correctly. 
+ For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" + + `sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).` + + "\n" + `if access_level != "user<span class="escaped-code-point" data-escaped="[U+202E]"><span class="char">` + "\u202e" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>// Check if admin<span class="escaped-code-point" data-escaped="[U+2069]"><span class="char">` + "\u2069" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>" {` + "\n", + status: EscapeStatus{Escaped: true, HasInvisible: true}, + }, + { + // UTF-8/16/32 all use the same codepoint for BOM + // Forgejo could read UTF-16/32 content and convert into UTF-8 internally then render it, so we only process UTF-8 internally + name: "UTF BOM", + text: "\xef\xbb\xbftest", + result: "\xef\xbb\xbftest", + status: EscapeStatus{}, + }, +} + +func TestEscapeControlReader(t *testing.T) { + // add some control characters to the tests + tests := make([]escapeControlTest, 0, len(escapeControlTests)*3) + copy(tests, escapeControlTests) + + // if there is a BOM, we should keep the BOM + addPrefix := func(prefix, s string) string { + if strings.HasPrefix(s, "\xef\xbb\xbf") { + return s[:3] + prefix + s[3:] + } + return prefix + s + } + for _, test := range escapeControlTests { + test.name += " (+Control)" + test.text = addPrefix("\u001E", test.text) + test.result = addPrefix(`<span class="escaped-code-point" data-escaped="[U+001E]"><span class="char">`+"\u001e"+`</span></span>`, test.result) + test.status.Escaped = true + test.status.HasInvisible = true + tests = append(tests, test) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + output := &strings.Builder{} + status, err := EscapeControlReader(strings.NewReader(tt.text), output, &translation.MockLocale{}, testContext) + 
require.NoError(t, err) + assert.Equal(t, tt.status, *status) + assert.Equal(t, tt.result, output.String()) + }) + } +} + +func TestSettingAmbiguousUnicodeDetection(t *testing.T) { + defer test.MockVariableValue(&setting.UI.AmbiguousUnicodeDetection, true)() + + _, out := EscapeControlHTML("a test", &translation.MockLocale{}, testContext) + assert.EqualValues(t, `a<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>test`, out) + setting.UI.AmbiguousUnicodeDetection = false + _, out = EscapeControlHTML("a test", &translation.MockLocale{}, testContext) + assert.EqualValues(t, `a test`, out) +} + +func TestAmbiguousUnicodeDetectionContext(t *testing.T) { + defer test.MockVariableValue(&setting.UI.SkipEscapeContexts, []string{"test"})() + + input := template.HTML("a test") + + _, out := EscapeControlHTML(input, &translation.MockLocale{}, escapeContext("not-test")) + assert.EqualValues(t, `a<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>test`, out) + + _, out = EscapeControlHTML(input, &translation.MockLocale{}, testContext) + assert.EqualValues(t, input, out) +} diff --git a/modules/charset/htmlstream.go b/modules/charset/htmlstream.go new file mode 100644 index 0000000..61f2912 --- /dev/null +++ b/modules/charset/htmlstream.go @@ -0,0 +1,200 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package charset + +import ( + "fmt" + "io" + + "golang.org/x/net/html" +) + +// HTMLStreamer represents a SAX-like interface for HTML +type HTMLStreamer interface { + Error(err error) error + Doctype(data string) error + Comment(data string) error + StartTag(data string, attrs ...html.Attribute) error + SelfClosingTag(data string, attrs ...html.Attribute) error + EndTag(data string) error + Text(data string) error +} + +// PassthroughHTMLStreamer is a passthrough streamer +type PassthroughHTMLStreamer struct { + next HTMLStreamer +} + +func NewPassthroughStreamer(next HTMLStreamer) *PassthroughHTMLStreamer { + return &PassthroughHTMLStreamer{next: next} +} + +var _ (HTMLStreamer) = &PassthroughHTMLStreamer{} + +// Error tells the next streamer in line that there is an error +func (p *PassthroughHTMLStreamer) Error(err error) error { + return p.next.Error(err) +} + +// Doctype tells the next streamer what the doctype is +func (p *PassthroughHTMLStreamer) Doctype(data string) error { + return p.next.Doctype(data) +} + +// Comment tells the next streamer there is a comment +func (p *PassthroughHTMLStreamer) Comment(data string) error { + return p.next.Comment(data) +} + +// StartTag tells the next streamer there is a starting tag +func (p *PassthroughHTMLStreamer) StartTag(data string, attrs ...html.Attribute) error { + return p.next.StartTag(data, attrs...) +} + +// SelfClosingTag tells the next streamer there is a self-closing tag +func (p *PassthroughHTMLStreamer) SelfClosingTag(data string, attrs ...html.Attribute) error { + return p.next.SelfClosingTag(data, attrs...) 
+} + +// EndTag tells the next streamer there is a end tag +func (p *PassthroughHTMLStreamer) EndTag(data string) error { + return p.next.EndTag(data) +} + +// Text tells the next streamer there is a text +func (p *PassthroughHTMLStreamer) Text(data string) error { + return p.next.Text(data) +} + +// HTMLStreamWriter acts as a writing sink +type HTMLStreamerWriter struct { + io.Writer + err error +} + +// Write implements io.Writer +func (h *HTMLStreamerWriter) Write(data []byte) (int, error) { + if h.err != nil { + return 0, h.err + } + return h.Writer.Write(data) +} + +// Write implements io.StringWriter +func (h *HTMLStreamerWriter) WriteString(data string) (int, error) { + if h.err != nil { + return 0, h.err + } + return h.Writer.Write([]byte(data)) +} + +// Error tells the next streamer in line that there is an error +func (h *HTMLStreamerWriter) Error(err error) error { + if h.err == nil { + h.err = err + } + return h.err +} + +// Doctype tells the next streamer what the doctype is +func (h *HTMLStreamerWriter) Doctype(data string) error { + _, h.err = h.WriteString("<!DOCTYPE " + data + ">") + return h.err +} + +// Comment tells the next streamer there is a comment +func (h *HTMLStreamerWriter) Comment(data string) error { + _, h.err = h.WriteString("<!--" + data + "-->") + return h.err +} + +// StartTag tells the next streamer there is a starting tag +func (h *HTMLStreamerWriter) StartTag(data string, attrs ...html.Attribute) error { + return h.startTag(data, attrs, false) +} + +// SelfClosingTag tells the next streamer there is a self-closing tag +func (h *HTMLStreamerWriter) SelfClosingTag(data string, attrs ...html.Attribute) error { + return h.startTag(data, attrs, true) +} + +func (h *HTMLStreamerWriter) startTag(data string, attrs []html.Attribute, selfclosing bool) error { + if _, h.err = h.WriteString("<" + data); h.err != nil { + return h.err + } + for _, attr := range attrs { + if _, h.err = h.WriteString(" " + attr.Key + "=\"" + 
html.EscapeString(attr.Val) + "\""); h.err != nil { + return h.err + } + } + if selfclosing { + if _, h.err = h.WriteString("/>"); h.err != nil { + return h.err + } + } else { + if _, h.err = h.WriteString(">"); h.err != nil { + return h.err + } + } + return h.err +} + +// EndTag tells the next streamer there is a end tag +func (h *HTMLStreamerWriter) EndTag(data string) error { + _, h.err = h.WriteString("</" + data + ">") + return h.err +} + +// Text tells the next streamer there is a text +func (h *HTMLStreamerWriter) Text(data string) error { + _, h.err = h.WriteString(html.EscapeString(data)) + return h.err +} + +// StreamHTML streams an html to a provided streamer +func StreamHTML(source io.Reader, streamer HTMLStreamer) error { + tokenizer := html.NewTokenizer(source) + for { + tt := tokenizer.Next() + switch tt { + case html.ErrorToken: + if tokenizer.Err() != io.EOF { + return tokenizer.Err() + } + return nil + case html.DoctypeToken: + token := tokenizer.Token() + if err := streamer.Doctype(token.Data); err != nil { + return err + } + case html.CommentToken: + token := tokenizer.Token() + if err := streamer.Comment(token.Data); err != nil { + return err + } + case html.StartTagToken: + token := tokenizer.Token() + if err := streamer.StartTag(token.Data, token.Attr...); err != nil { + return err + } + case html.SelfClosingTagToken: + token := tokenizer.Token() + if err := streamer.StartTag(token.Data, token.Attr...); err != nil { + return err + } + case html.EndTagToken: + token := tokenizer.Token() + if err := streamer.EndTag(token.Data); err != nil { + return err + } + case html.TextToken: + token := tokenizer.Token() + if err := streamer.Text(token.Data); err != nil { + return err + } + default: + return fmt.Errorf("unknown type of token: %d", tt) + } + } +} diff --git a/modules/charset/invisible/generate.go b/modules/charset/invisible/generate.go new file mode 100644 index 0000000..bd57dd6 --- /dev/null +++ b/modules/charset/invisible/generate.go @@ 
// InvisibleRunes these are runes that vscode has assigned to be invisible
// See https://github.com/hediet/vscode-unicode-data
//
// The runes are written as inclusive {lo, hi} runs and expanded in ascending
// order, which yields exactly the same slice as listing every rune one by one.
var InvisibleRunes = expandRuneRuns([][2]rune{
	{9, 13},
	{32, 32},
	{127, 127},
	{160, 160},
	{173, 173},
	{847, 847},
	{1564, 1564},
	{4447, 4448},
	{6068, 6069},
	{6155, 6158},
	{7355, 7356},
	{8192, 8207},
	{8234, 8239},
	{8287, 8303},
	{10240, 10240},
	{12288, 12288},
	{12644, 12644},
	{65024, 65039},
	{65279, 65279},
	{65440, 65440},
	{65520, 65528},
	{65532, 65532},
	{78844, 78844},
	{119155, 119162},
	{917504, 917631},
	{917760, 917999},
})

// expandRuneRuns flattens inclusive {lo, hi} pairs into the individual runes
// they cover, preserving the order of the pairs.
func expandRuneRuns(runs [][2]rune) []rune {
	var out []rune
	for _, run := range runs {
		for r := run[0]; r <= run[1]; r++ {
			out = append(out, r)
		}
	}
	return out
}
+var verbose bool + +func main() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, `%s: Generate InvisibleRunesRange + +Usage: %[1]s [-v] [-o output.go] +`, os.Args[0]) + flag.PrintDefaults() + } + + output := "" + flag.BoolVar(&verbose, "v", false, "verbose output") + flag.StringVar(&output, "o", "invisible_gen.go", "file to output to") + flag.Parse() + + // First we filter the runes to remove + // <space><tab><newline> + filtered := make([]rune, 0, len(InvisibleRunes)) + for _, r := range InvisibleRunes { + if r == ' ' || r == '\t' || r == '\n' { + continue + } + filtered = append(filtered, r) + } + + table := rangetable.New(filtered...) + if err := runTemplate(generatorTemplate, output, table); err != nil { + fatalf("Unable to run template: %v", err) + } +} + +func runTemplate(t *template.Template, filename string, data any) error { + buf := bytes.NewBuffer(nil) + if err := t.Execute(buf, data); err != nil { + return fmt.Errorf("unable to execute template: %w", err) + } + bs, err := format.Source(buf.Bytes()) + if err != nil { + verbosef("Bad source:\n%s", buf.String()) + return fmt.Errorf("unable to format source: %w", err) + } + + old, err := os.ReadFile(filename) + if err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to read old file %s because %w", filename, err) + } else if err == nil { + if bytes.Equal(bs, old) { + // files are the same don't rewrite it. + return nil + } + } + + file, err := os.Create(filename) + if err != nil { + return fmt.Errorf("failed to create file %s because %w", filename, err) + } + defer file.Close() + _, err = file.Write(bs) + if err != nil { + return fmt.Errorf("unable to write generated source: %w", err) + } + return nil +} + +var generatorTemplate = template.Must(template.New("invisibleTemplate").Parse(`// This file is generated by modules/charset/invisible/generate.go DO NOT EDIT +// Copyright 2022 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + + +package charset + +import "unicode" + +var InvisibleRanges = &unicode.RangeTable{ + R16: []unicode.Range16{ +{{range .R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}}, +{{end}} }, + R32: []unicode.Range32{ +{{range .R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}}, +{{end}} }, + LatinOffset: {{.LatinOffset}}, +} +`)) + +func logf(format string, args ...any) { + fmt.Fprintf(os.Stderr, format+"\n", args...) +} + +func verbosef(format string, args ...any) { + if verbose { + logf(format, args...) + } +} + +func fatalf(format string, args ...any) { + logf("fatal: "+format+"\n", args...) + os.Exit(1) +} diff --git a/modules/charset/invisible_gen.go b/modules/charset/invisible_gen.go new file mode 100644 index 0000000..812f0e3 --- /dev/null +++ b/modules/charset/invisible_gen.go @@ -0,0 +1,36 @@ +// This file is generated by modules/charset/invisible/generate.go DO NOT EDIT +// Copyright 2022 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package charset + +import "unicode" + +var InvisibleRanges = &unicode.RangeTable{ + R16: []unicode.Range16{ + {Lo: 11, Hi: 13, Stride: 1}, + {Lo: 127, Hi: 160, Stride: 33}, + {Lo: 173, Hi: 847, Stride: 674}, + {Lo: 1564, Hi: 4447, Stride: 2883}, + {Lo: 4448, Hi: 6068, Stride: 1620}, + {Lo: 6069, Hi: 6155, Stride: 86}, + {Lo: 6156, Hi: 6158, Stride: 1}, + {Lo: 7355, Hi: 7356, Stride: 1}, + {Lo: 8192, Hi: 8207, Stride: 1}, + {Lo: 8234, Hi: 8239, Stride: 1}, + {Lo: 8287, Hi: 8303, Stride: 1}, + {Lo: 10240, Hi: 12288, Stride: 2048}, + {Lo: 12644, Hi: 65024, Stride: 52380}, + {Lo: 65025, Hi: 65039, Stride: 1}, + {Lo: 65279, Hi: 65440, Stride: 161}, + {Lo: 65520, Hi: 65528, Stride: 1}, + {Lo: 65532, Hi: 65532, Stride: 1}, + }, + R32: []unicode.Range32{ + {Lo: 78844, Hi: 119155, Stride: 40311}, + {Lo: 119156, Hi: 119162, Stride: 1}, + {Lo: 917504, Hi: 917631, Stride: 1}, + {Lo: 917760, Hi: 917999, Stride: 1}, + }, + LatinOffset: 2, +} |