From 461b008e5ff045cc9573ce04374ab8825a36f1a0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 8 Nov 2019 22:40:54 -0600 Subject: [PATCH 01/34] JavaScript: Stop breaking surrogate pairs in toDelta() Resolves #69 for JavaScript Sometimes we can find a common prefix that runs into the middle of a surrogate pair and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but shouldn't change that much - only by a single character at surrogate boundaries. --- javascript/diff_match_patch_uncompressed.js | 25 +++++++++ javascript/tests/diff_match_patch_test.js | 58 +++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/javascript/diff_match_patch_uncompressed.js b/javascript/diff_match_patch_uncompressed.js index 88a702c..e8bb278 100644 --- a/javascript/diff_match_patch_uncompressed.js +++ b/javascript/diff_match_patch_uncompressed.js @@ -1339,6 +1339,15 @@ diff_match_patch.prototype.diff_levenshtein = function(diffs) { return levenshtein; }; +diff_match_patch.prototype.isHighSurrogate = function(c) { + var v = c.charCodeAt(0); + return v >= 0xD800 && v <= 0xDBFF; +} + +diff_match_patch.prototype.isLowSurrogate = function(c) { + var v = c.charCodeAt(0); + return v >= 0xDC00 && v <= 0xDFFF; +} /** * Crush the diff into an encoded string which describes the operations @@ -1350,7 +1359,23 @@ diff_match_patch.prototype.diff_levenshtein = function(diffs) { */ diff_match_patch.prototype.diff_toDelta = function(diffs) { var text = []; + var lastEnd; for (var x = 0; x < 
diffs.length; x++) { + + var thisDiff = diffs[x]; + var thisTop = thisDiff[1][0]; + var thisEnd = thisDiff[1][thisDiff[1].length - 1]; + + if (thisEnd && this.isHighSurrogate(thisEnd)) { + thisDiff[1] = thisDiff[1].slice(0, -1); + } + + if (lastEnd && thisTop && this.isHighSurrogate(lastEnd) && this.isLowSurrogate(thisTop)) { + thisDiff[1] = lastEnd + thisDiff[1]; + } + + lastEnd = thisEnd; + switch (diffs[x][0]) { case DIFF_INSERT: text[x] = '+' + encodeURI(diffs[x][1]); diff --git a/javascript/tests/diff_match_patch_test.js b/javascript/tests/diff_match_patch_test.js index 109e56a..c48f3bb 100644 --- a/javascript/tests/diff_match_patch_test.js +++ b/javascript/tests/diff_match_patch_test.js @@ -492,6 +492,64 @@ function testDiffDelta() { // Convert delta string into a diff. assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta)); + diffs = [[DIFF_EQUAL, '\ud83d\ude4b\ud83d'], [DIFF_INSERT, '\ude4c\ud83d'], [DIFF_EQUAL, '\ude4b']]; + try { + delta = dmp.diff_toDelta(diffs); + assertEquals('=2\t+%F0%9F%99%8C\t=2', delta); + } catch ( e ) { + assertEquals(false, true); + } + + (function(){ + const originalText = `U+1F17x πŸ…°οΈ πŸ…±οΈ πŸ…ΎοΈ πŸ…ΏοΈ safhawifhkw + U+1F18x πŸ†Ž + 0 1 2 3 4 5 6 7 8 9 A B C D E F + U+1F19x πŸ†‘ πŸ†’ πŸ†“ πŸ†” πŸ†• πŸ†– πŸ†— πŸ†˜ πŸ†™ πŸ†š + U+1F20x 🈁 πŸˆ‚οΈ sfss.,_||saavvvbbds + U+1F21x 🈚 + U+1F22x 🈯 + U+1F23x 🈲 🈳 🈴 🈡 🈢 🈷️ 🈸 🈹 🈺 + U+1F25x πŸ‰ πŸ‰‘ + U+1F30x πŸŒ€ 🌁 πŸŒ‚ πŸŒƒ πŸŒ„ πŸŒ… πŸŒ† πŸŒ‡ 🌈 πŸŒ‰ 🌊 πŸŒ‹ 🌌 🌍 🌎 🌏 + U+1F31x 🌐 πŸŒ‘ πŸŒ’ πŸŒ“ πŸŒ” πŸŒ• πŸŒ– πŸŒ— 🌘 πŸŒ™ 🌚 πŸŒ› 🌜 🌝 🌞 `; + + // applies some random edits to string and returns new, edited string + function applyRandomTextEdit(text) { + let textArr = [...text]; + let r = Math.random(); + if(r < 1/3) { // swap + let swapCount = Math.floor(Math.random()*5); + for(let i = 0; i < swapCount; i++) { + let swapPos1 = Math.floor(Math.random()*textArr.length); + let swapPos2 = Math.floor(Math.random()*textArr.length); + let char1 = textArr[swapPos1]; + let char2 = 
textArr[swapPos2]; + textArr[swapPos1] = char2; + textArr[swapPos2] = char1; + } + } else if(r < 2/3) { // remove + let removeCount = Math.floor(Math.random()*5); + for(let i = 0; i < removeCount; i++) { + let removePos = Math.floor(Math.random()*textArr.length); + textArr[removePos] = ""; + } + } else { // add + let addCount = Math.floor(Math.random()*5); + for(let i = 0; i < addCount; i++) { + let addPos = Math.floor(Math.random()*textArr.length); + let addFromPos = Math.floor(Math.random()*textArr.length); + textArr[addPos] = textArr[addPos] + textArr[addFromPos]; + } + } + return textArr.join(""); + } + + for(let i = 0; i < 1000; i++) { + newText = applyRandomTextEdit(originalText); + dmp.patch_toText(dmp.patch_make(originalText, newText)); + } + }); + // Verify pool of unchanged characters. diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']]; var text2 = dmp.diff_text2(diffs); From 71646fbcff9cab79983bf1aa04dbfabd23d63aa4 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 9 Nov 2019 14:54:38 -0600 Subject: [PATCH 02/34] Add fixes for Java client --- .../neil/plaintext/diff_match_patch.java | 18 ++++++++++++++++++ .../neil/plaintext/diff_match_patch_test.java | 4 ++++ 2 files changed, 22 insertions(+) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index 9d07867..c7935ed 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -19,6 +19,7 @@ package name.fraser.neil.plaintext; import java.io.UnsupportedEncodingException; +import java.lang.Character; import java.net.URLDecoder; import java.net.URLEncoder; import java.util.*; @@ -1429,7 +1430,24 @@ public int diff_levenshtein(List diffs) { */ public String diff_toDelta(List diffs) { StringBuilder text = new StringBuilder(); + char lastEnd = 0; + boolean isFirst = true; for (Diff aDiff : diffs) { + + char thisTop = 
aDiff.text.charAt(0); + char thisEnd = aDiff.text.charAt(aDiff.text.length() - 1); + + if (Character.isHighSurrogate(thisEnd)) { + aDiff.text = aDiff.text.substring(0, aDiff.text.length() - 1); + } + + if (! isFirst && Character.isHighSurrogate(lastEnd) && Character.isLowSurrogate(thisTop)) { + aDiff.text = lastEnd + aDiff.text; + } + + isFirst = false; + lastEnd = thisEnd; + switch (aDiff.operation) { case INSERT: try { diff --git a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java index 2f38793..aef98ce 100644 --- a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java +++ b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java @@ -424,6 +424,10 @@ public static void testDiffDelta() { assertEquals("diff_fromDelta: Unicode.", diffs, dmp.diff_fromDelta(text1, delta)); + diffs = diffList(new Diff(EQUAL, "\ud83d\ude4b\ud83d"), new Diff(INSERT, "\ude4c\ud83d"), new Diff(EQUAL, "\ude4b")); + delta = dmp.diff_toDelta(diffs); + assertEquals("diff_toDelta: Surrogate Pairs.", "=2\t+%F0%9F%99%8C\t=2", delta); + // Verify pool of unchanged characters. diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? 
: @ & = + $ , # ")); String text2 = dmp.diff_text2(diffs); From 8d2a5f8a8cd4684228eaf5df28654492cb2e7902 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 9 Nov 2019 22:54:25 -0600 Subject: [PATCH 03/34] Add fixes for ObjectiveC --- objectivec/DiffMatchPatch.m | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index 580f265..8adb5dc 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1299,7 +1299,22 @@ - (NSString *)diff_text2:(NSMutableArray *)diffs; - (NSString *)diff_toDelta:(NSMutableArray *)diffs; { NSMutableString *delta = [NSMutableString string]; + UniChar lastEnd; for (Diff *aDiff in diffs) { + + UniChar thisTop = [aDiff.text characterAtIndex:0]; + UniChar thisEnd = [aDiff.text characterAtIndex:([aDiff.text length]-1)]; + + if (CFStringIsSurrogateHighCharacter(thisEnd)) { + aDiff.text = [aDiff.text substringToIndex:([aDiff.text length] - 1)]; + } + + if (nil != lastEnd && CFStringIsSurrogateHighCharacter(lastEnd) && CFStringIsSurrogateLowCharacter(thisTop)) { + aDiff.text = [[NSString stringWithFormat:@"%C", lastEnd] stringByAppendingString:aDiff.text]; + } + + lastEnd = thisEnd; + switch (aDiff.operation) { case DIFF_INSERT: [delta appendFormat:@"+%@\t", [[aDiff.text diff_stringByAddingPercentEscapesForEncodeUriCompatibility] From 535e29e8447ec303eb7350db6ceb88333c5ab2f5 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 10 Nov 2019 00:28:26 -0600 Subject: [PATCH 04/34] Add fixes for Python2 --- python2/diff_match_patch.py | 25 +++++++++++++++++++++++++ python2/tests/diff_match_patch_test.py | 4 ++++ 2 files changed, 29 insertions(+) diff --git a/python2/diff_match_patch.py b/python2/diff_match_patch.py index 806fe1e..0d0e8a8 100644 --- a/python2/diff_match_patch.py +++ b/python2/diff_match_patch.py @@ -28,6 +28,7 @@ __author__ = 'fraser@google.com (Neil Fraser)' import re +import struct import sys import time import urllib @@ -1135,6 +1136,16 @@ 
def diff_levenshtein(self, diffs): levenshtein += max(insertions, deletions) return levenshtein + @classmethod + def is_high_surrogate(cls, utf16be_bytes): + c = struct.unpack('>H', utf16be_bytes)[0] + return c >= 0xd800 and c <= 0xdbff + + @classmethod + def is_low_surrogate(cls, utf16be_bytes): + c = struct.unpack('>H', utf16be_bytes)[0] + return c >= 0xdc00 and c <= 0xdfff + def diff_toDelta(self, diffs): """Crush the diff into an encoded string which describes the operations required to transform text1 into text2. @@ -1148,7 +1159,21 @@ def diff_toDelta(self, diffs): Delta text. """ text = [] + last_end = None for (op, data) in diffs: + encoded = data.encode('utf-16be') + this_top = encoded[0:2] + this_end = encoded[-2:] + + if self.is_high_surrogate(this_end): + encoded = encoded[0:-2] + + if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top): + encoded = last_end + encoded + + data = encoded.decode('utf-16be') + last_end = this_end + if op == self.DIFF_INSERT: # High ascii will raise UnicodeDecodeError. Use Unicode instead. data = data.encode("utf-8") diff --git a/python2/tests/diff_match_patch_test.py b/python2/tests/diff_match_patch_test.py index 661a6b6..fc633bc 100644 --- a/python2/tests/diff_match_patch_test.py +++ b/python2/tests/diff_match_patch_test.py @@ -441,6 +441,10 @@ def testDiffDelta(self): # Convert delta string into a diff. self.assertEquals(diffs, self.dmp.diff_fromDelta(text1, delta)) + diffs = [(self.dmp.DIFF_EQUAL, u"\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, u"\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, u"\ude4b")] + delta = self.dmp.diff_toDelta(diffs) + self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta) + # Verify pool of unchanged characters. diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? 
: @ & = + $ , # ")] text2 = self.dmp.diff_text2(diffs) From aafbd58d356a1b783e759ec1895c2ffbcc471cb3 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 10 Nov 2019 01:05:47 -0600 Subject: [PATCH 05/34] Add fixes for Python3 --- python3/diff_match_patch.py | 24 ++++++++++++++++++++++++ python3/tests/diff_match_patch_test.py | 5 +++++ 2 files changed, 29 insertions(+) diff --git a/python3/diff_match_patch.py b/python3/diff_match_patch.py index cc7f590..e5c52e4 100644 --- a/python3/diff_match_patch.py +++ b/python3/diff_match_patch.py @@ -26,6 +26,7 @@ __author__ = 'fraser@google.com (Neil Fraser)' import re +import struct import sys import time import urllib.parse @@ -1133,6 +1134,16 @@ def diff_levenshtein(self, diffs): levenshtein += max(insertions, deletions) return levenshtein + @classmethod + def is_high_surrogate(cls, utf16be_bytes): + c = struct.unpack('>H', utf16be_bytes)[0] + return c >= 0xd800 and c <= 0xdbff + + @classmethod + def is_low_surrogate(cls, utf16be_bytes): + c = struct.unpack('>H', utf16be_bytes)[0] + return c >= 0xdc00 and c <= 0xdfff + def diff_toDelta(self, diffs): """Crush the diff into an encoded string which describes the operations required to transform text1 into text2. @@ -1146,7 +1157,20 @@ def diff_toDelta(self, diffs): Delta text. """ text = [] + last_end = None for (op, data) in diffs: + encoded = data.encode('utf-16be', 'surrogatepass') + this_top = encoded[0:2] + this_end = encoded[-2:] + + if self.is_high_surrogate(this_end): + encoded = encoded[0:-2] + + if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top): + encoded = last_end + encoded + + data = encoded.decode('utf-16be', 'surrogateescape') + last_end = this_end if op == self.DIFF_INSERT: # High ascii will raise UnicodeDecodeError. Use Unicode instead. 
data = data.encode("utf-8") diff --git a/python3/tests/diff_match_patch_test.py b/python3/tests/diff_match_patch_test.py index 3659d3e..6fa69ad 100644 --- a/python3/tests/diff_match_patch_test.py +++ b/python3/tests/diff_match_patch_test.py @@ -18,6 +18,7 @@ """ import imp +import json import os import sys import time @@ -444,6 +445,10 @@ def testDiffDelta(self): # Convert delta string into a diff. self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta)) + diffs = [(self.dmp.DIFF_EQUAL, "\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, "\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, "\ude4b")] + delta = self.dmp.diff_toDelta(diffs) + self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta) + # Verify pool of unchanged characters. diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")] text2 = self.dmp.diff_text2(diffs) From 4fc00732de199b38b5e1ed18e8c465a75a2772c2 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 10 Nov 2019 01:20:34 -0600 Subject: [PATCH 06/34] Add fixes for Python3 and fix counter in Python2 --- python2/diff_match_patch.py | 4 ++-- python3/diff_match_patch.py | 4 ++-- python3/tests/diff_match_patch_test.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python2/diff_match_patch.py b/python2/diff_match_patch.py index 0d0e8a8..d859753 100644 --- a/python2/diff_match_patch.py +++ b/python2/diff_match_patch.py @@ -1179,9 +1179,9 @@ def diff_toDelta(self, diffs): data = data.encode("utf-8") text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# ")) elif op == self.DIFF_DELETE: - text.append("-%d" % len(data)) + text.append("-%d" % (len(data.encode('utf-16be')) // 2)) elif op == self.DIFF_EQUAL: - text.append("=%d" % len(data)) + text.append("=%d" % (len(data.encode('utf-16be')) // 2)) return "\t".join(text) def diff_fromDelta(self, text1, delta): diff --git a/python3/diff_match_patch.py b/python3/diff_match_patch.py index e5c52e4..5253c8b 100644 --- a/python3/diff_match_patch.py +++ 
b/python3/diff_match_patch.py @@ -1176,9 +1176,9 @@ def diff_toDelta(self, diffs): data = data.encode("utf-8") text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# ")) elif op == self.DIFF_DELETE: - text.append("-%d" % len(data)) + text.append("-%d" % (len(data.encode('utf-16-be')) // 2)) elif op == self.DIFF_EQUAL: - text.append("=%d" % len(data)) + text.append("=%d" % (len(data.encode('utf-16-be')) // 2)) return "\t".join(text) def diff_fromDelta(self, text1, delta): diff --git a/python3/tests/diff_match_patch_test.py b/python3/tests/diff_match_patch_test.py index 6fa69ad..e8bae30 100644 --- a/python3/tests/diff_match_patch_test.py +++ b/python3/tests/diff_match_patch_test.py @@ -447,7 +447,7 @@ def testDiffDelta(self): diffs = [(self.dmp.DIFF_EQUAL, "\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, "\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, "\ude4b")] delta = self.dmp.diff_toDelta(diffs) - self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta) + self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta) # Verify pool of unchanged characters. diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")] From d0a578f152bc7099bc2fd664be49f16b58a97d96 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 10 Nov 2019 10:56:35 -0600 Subject: [PATCH 07/34] Adjust Python3 code I'm not sure that I made the right assumptions about Python3's Unicode handling when I made the first patch to it. By constructing the specific `diffs` output I created a sequence of code units that `diff_main` in Python3 would _not_ have made because it's operating on Unicode code points natively when finding the common prefix. Therefore I do not think that the Python3 library experienced this problem as the others did. Nonetheless it _has_ been reporting the diff length differently than in other languages and I have left that change in there. Of note, it doesn't look like we have true harmony between the languages despite the appearance of such. 
The `lua` wiki page makes this clear, but at least with Python we have the ability to harmonize the meaning of the lengths and I have done that in this change. --- python3/diff_match_patch.py | 22 ---------------------- python3/tests/diff_match_patch_test.py | 2 +- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/python3/diff_match_patch.py b/python3/diff_match_patch.py index 5253c8b..aa0fc90 100644 --- a/python3/diff_match_patch.py +++ b/python3/diff_match_patch.py @@ -1134,16 +1134,6 @@ def diff_levenshtein(self, diffs): levenshtein += max(insertions, deletions) return levenshtein - @classmethod - def is_high_surrogate(cls, utf16be_bytes): - c = struct.unpack('>H', utf16be_bytes)[0] - return c >= 0xd800 and c <= 0xdbff - - @classmethod - def is_low_surrogate(cls, utf16be_bytes): - c = struct.unpack('>H', utf16be_bytes)[0] - return c >= 0xdc00 and c <= 0xdfff - def diff_toDelta(self, diffs): """Crush the diff into an encoded string which describes the operations required to transform text1 into text2. @@ -1159,18 +1149,6 @@ def diff_toDelta(self, diffs): text = [] last_end = None for (op, data) in diffs: - encoded = data.encode('utf-16be', 'surrogatepass') - this_top = encoded[0:2] - this_end = encoded[-2:] - - if self.is_high_surrogate(this_end): - encoded = encoded[0:-2] - - if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top): - encoded = last_end + encoded - - data = encoded.decode('utf-16be', 'surrogateescape') - last_end = this_end if op == self.DIFF_INSERT: # High ascii will raise UnicodeDecodeError. Use Unicode instead. data = data.encode("utf-8") diff --git a/python3/tests/diff_match_patch_test.py b/python3/tests/diff_match_patch_test.py index e8bae30..f88f003 100644 --- a/python3/tests/diff_match_patch_test.py +++ b/python3/tests/diff_match_patch_test.py @@ -445,7 +445,7 @@ def testDiffDelta(self): # Convert delta string into a diff. 
self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta)) - diffs = [(self.dmp.DIFF_EQUAL, "\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, "\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, "\ude4b")] + diffs = self.dmp.diff_main("\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B") delta = self.dmp.diff_toDelta(diffs) self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta) From 08de57e85ba1807e678c800314945759550bcc09 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 10 Nov 2019 12:20:40 -0600 Subject: [PATCH 08/34] Fix reconstructing diff in Python3 --- python3/diff_match_patch.py | 11 ++++++----- python3/tests/diff_match_patch_test.py | 2 ++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python3/diff_match_patch.py b/python3/diff_match_patch.py index aa0fc90..9fb3e26 100644 --- a/python3/diff_match_patch.py +++ b/python3/diff_match_patch.py @@ -1174,7 +1174,8 @@ def diff_fromDelta(self, text1, delta): ValueError: If invalid input. """ diffs = [] - pointer = 0 # Cursor in text1 + as_utf16 = text1.encode('utf-16-be') + pointer = 0 # Cursor in as_utf16 tokens = delta.split("\t") for token in tokens: if token == "": @@ -1193,8 +1194,8 @@ def diff_fromDelta(self, text1, delta): raise ValueError("Invalid number in diff_fromDelta: " + param) if n < 0: raise ValueError("Negative number in diff_fromDelta: " + param) - text = text1[pointer : pointer + n] - pointer += n + text = as_utf16[pointer : pointer + n * 2].decode('utf-16-be') + pointer += n * 2 if token[0] == "=": diffs.append((self.DIFF_EQUAL, text)) else: @@ -1203,10 +1204,10 @@ def diff_fromDelta(self, text1, delta): # Anything else is an error. raise ValueError("Invalid diff operation in diff_fromDelta: " + token[0]) - if pointer != len(text1): + if pointer != len(as_utf16): raise ValueError( "Delta length (%d) does not equal source text length (%d)." 
% - (pointer, len(text1))) + (pointer, len(as_utf16))) return diffs # MATCH FUNCTIONS diff --git a/python3/tests/diff_match_patch_test.py b/python3/tests/diff_match_patch_test.py index f88f003..9474762 100644 --- a/python3/tests/diff_match_patch_test.py +++ b/python3/tests/diff_match_patch_test.py @@ -449,6 +449,8 @@ def testDiffDelta(self): delta = self.dmp.diff_toDelta(diffs) self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta) + self.assertEqual(diffs, self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2")) + # Verify pool of unchanged characters. diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")] text2 = self.dmp.diff_text2(diffs) From df810f72007c3da7fda2570da8ab18f206eeff6b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 11 Nov 2019 16:20:38 -0600 Subject: [PATCH 09/34] Updates from review feedback --- .../name/fraser/neil/plaintext/diff_match_patch.java | 3 +++ javascript/diff_match_patch_uncompressed.js | 3 +++ objectivec/DiffMatchPatch.m | 9 ++++++--- objectivec/Tests/DiffMatchPatchTest.m | 12 ++++++++++++ python2/diff_match_patch.py | 2 ++ python3/diff_match_patch.py | 1 - 6 files changed, 26 insertions(+), 4 deletions(-) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index c7935ed..d5ecb2c 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -1447,6 +1447,9 @@ public String diff_toDelta(List diffs) { isFirst = false; lastEnd = thisEnd; + if ( aDiff.text.isEmpty() ) { + continue; + } switch (aDiff.operation) { case INSERT: diff --git a/javascript/diff_match_patch_uncompressed.js b/javascript/diff_match_patch_uncompressed.js index e8bb278..84a78e8 100644 --- a/javascript/diff_match_patch_uncompressed.js +++ b/javascript/diff_match_patch_uncompressed.js @@ -1375,6 +1375,9 @@ diff_match_patch.prototype.diff_toDelta = function(diffs) { 
} lastEnd = thisEnd; + if ( 0 === thisDiff[1].length ) { + continue; + } switch (diffs[x][0]) { case DIFF_INSERT: diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index 8adb5dc..75efae0 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1299,7 +1299,7 @@ - (NSString *)diff_text2:(NSMutableArray *)diffs; - (NSString *)diff_toDelta:(NSMutableArray *)diffs; { NSMutableString *delta = [NSMutableString string]; - UniChar lastEnd; + UniChar lastEnd = 0; for (Diff *aDiff in diffs) { UniChar thisTop = [aDiff.text characterAtIndex:0]; @@ -1309,11 +1309,14 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs; aDiff.text = [aDiff.text substringToIndex:([aDiff.text length] - 1)]; } - if (nil != lastEnd && CFStringIsSurrogateHighCharacter(lastEnd) && CFStringIsSurrogateLowCharacter(thisTop)) { - aDiff.text = [[NSString stringWithFormat:@"%C", lastEnd] stringByAppendingString:aDiff.text]; + if (0 != lastEnd && CFStringIsSurrogateHighCharacter(lastEnd) && CFStringIsSurrogateLowCharacter(thisTop)) { + aDiff.text = [NSString stringWithFormat:@"%C%@", lastEnd, stringByAppendingString:aDiff.text]; } lastEnd = thisEnd; + if (0 == [aDiff.text length]) { + continue; + } switch (aDiff.operation) { case DIFF_INSERT: diff --git a/objectivec/Tests/DiffMatchPatchTest.m b/objectivec/Tests/DiffMatchPatchTest.m index 9697b04..807ae82 100755 --- a/objectivec/Tests/DiffMatchPatchTest.m +++ b/objectivec/Tests/DiffMatchPatchTest.m @@ -752,6 +752,18 @@ - (void)test_diff_deltaTest { XCTAssertEqualObjects(diffs, [dmp diff_fromDeltaWithText:text1 andDelta:delta error:NULL], @"diff_fromDelta: Unicode 2."); + diffs = [dmp diff_mainOfOldString:@"β˜ΊοΈπŸ––πŸΏ" andNewString:@"β˜ΊοΈπŸ˜ƒπŸ––πŸΏ"]; + delta = [dmp diff_toDelta:diffs]; + + XCTAssertEqualObjects(delta, @"=2\t+%F0%9F%98%83\t=4", @"Delta should match the expected string"); + + diffs = [dmp diff_mainOfOldString:@"β˜ΊοΈπŸ––πŸΏ" andNewString:@"β˜ΊοΈπŸ˜ƒπŸ––πŸΏ"]; + patches = [dmp 
patch_makeFromDiffs:diffs]; + expectedResult = [dmp patch_apply:patches toString:@"β˜ΊοΈπŸ––πŸΏ"]; + + expectedString = [result firstObject]; + XCTAssertEqualObjects(edited, expectedString, @"Output String should match the Edited one!"); + // Verify pool of unchanged characters. diffs = [NSMutableArray arrayWithObject: [Diff diffWithOperation:DIFF_INSERT andText:@"A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "]]; diff --git a/python2/diff_match_patch.py b/python2/diff_match_patch.py index d859753..71cbd19 100644 --- a/python2/diff_match_patch.py +++ b/python2/diff_match_patch.py @@ -1173,6 +1173,8 @@ def diff_toDelta(self, diffs): data = encoded.decode('utf-16be') last_end = this_end + if 0 == len(encoded): + continue if op == self.DIFF_INSERT: # High ascii will raise UnicodeDecodeError. Use Unicode instead. diff --git a/python3/diff_match_patch.py b/python3/diff_match_patch.py index 9fb3e26..99aa853 100644 --- a/python3/diff_match_patch.py +++ b/python3/diff_match_patch.py @@ -1147,7 +1147,6 @@ def diff_toDelta(self, diffs): Delta text. """ text = [] - last_end = None for (op, data) in diffs: if op == self.DIFF_INSERT: # High ascii will raise UnicodeDecodeError. Use Unicode instead. From 1d45818a42eb6dd3b3471f2a1d9943fbbc173fcc Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 11 Nov 2019 16:39:39 -0600 Subject: [PATCH 10/34] fixup! 
Updates from review feedback --- objectivec/DiffMatchPatch.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index 75efae0..495a07b 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1310,7 +1310,7 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs; } if (0 != lastEnd && CFStringIsSurrogateHighCharacter(lastEnd) && CFStringIsSurrogateLowCharacter(thisTop)) { - aDiff.text = [NSString stringWithFormat:@"%C%@", lastEnd, stringByAppendingString:aDiff.text]; + aDiff.text = [NSString stringWithFormat:@"%C%@", lastEnd, aDiff.text]; } lastEnd = thisEnd; From 82c11111e4c3b99633b7d387fd616db24f503dd0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 11 Nov 2019 16:42:18 -0600 Subject: [PATCH 11/34] fixup! fixup! Updates from review feedback --- objectivec/Tests/DiffMatchPatchTest.m | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/objectivec/Tests/DiffMatchPatchTest.m b/objectivec/Tests/DiffMatchPatchTest.m index 807ae82..2869492 100755 --- a/objectivec/Tests/DiffMatchPatchTest.m +++ b/objectivec/Tests/DiffMatchPatchTest.m @@ -758,11 +758,11 @@ - (void)test_diff_deltaTest { XCTAssertEqualObjects(delta, @"=2\t+%F0%9F%98%83\t=4", @"Delta should match the expected string"); diffs = [dmp diff_mainOfOldString:@"β˜ΊοΈπŸ––πŸΏ" andNewString:@"β˜ΊοΈπŸ˜ƒπŸ––πŸΏ"]; - patches = [dmp patch_makeFromDiffs:diffs]; - expectedResult = [dmp patch_apply:patches toString:@"β˜ΊοΈπŸ––πŸΏ"]; + NSArray *patches = [dmp patch_makeFromDiffs:diffs]; + NSArray *patchResult = [dmp patch_apply:patches toString:@"β˜ΊοΈπŸ––πŸΏ"]; - expectedString = [result firstObject]; - XCTAssertEqualObjects(edited, expectedString, @"Output String should match the Edited one!"); + expectedString = [patchResult firstObject]; + XCTAssertEqualObjects(@"β˜ΊοΈπŸ˜ƒπŸ––πŸΏ", expectedString, @"Output String should match the Edited one!"); // Verify pool of unchanged characters. 
diffs = [NSMutableArray arrayWithObject: From ea303f28cbf41dd7ec27cbf1183c6bdf0ee5d84c Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 11 Nov 2019 17:06:52 -0600 Subject: [PATCH 12/34] Rebuild compressed JavaScript with Closure Compiler ```bash java \ -jar path/to/closure-compiler-v20191027.jar \ --js_output_file=diff_match_patch.js \ diff_match_patch_uncompressed.js ``` --- javascript/diff_match_patch.js | 102 ++++++++++++++++----------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/javascript/diff_match_patch.js b/javascript/diff_match_patch.js index 2fe320a..7fce1db 100644 --- a/javascript/diff_match_patch.js +++ b/javascript/diff_match_patch.js @@ -1,55 +1,55 @@ var diff_match_patch=function(){this.Diff_Timeout=1;this.Diff_EditCost=4;this.Match_Threshold=.5;this.Match_Distance=1E3;this.Patch_DeleteThreshold=.5;this.Patch_Margin=4;this.Match_MaxBits=32},DIFF_DELETE=-1,DIFF_INSERT=1,DIFF_EQUAL=0;diff_match_patch.Diff=function(a,b){this[0]=a;this[1]=b};diff_match_patch.Diff.prototype.length=2;diff_match_patch.Diff.prototype.toString=function(){return this[0]+","+this[1]}; -diff_match_patch.prototype.diff_main=function(a,b,c,d){"undefined"==typeof d&&(d=0>=this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. 
(diff_main)");if(a==b)return a?[new diff_match_patch.Diff(DIFF_EQUAL,a)]:[];"undefined"==typeof c&&(c=!0);var e=c,f=this.diff_commonPrefix(a,b);c=a.substring(0,f);a=a.substring(f);b=b.substring(f);f=this.diff_commonSuffix(a,b);var g=a.substring(a.length-f);a=a.substring(0,a.length-f);b=b.substring(0, -b.length-f);a=this.diff_compute_(a,b,e,d);c&&a.unshift(new diff_match_patch.Diff(DIFF_EQUAL,c));g&&a.push(new diff_match_patch.Diff(DIFF_EQUAL,g));this.diff_cleanupMerge(a);return a}; -diff_match_patch.prototype.diff_compute_=function(a,b,c,d){if(!a)return[new diff_match_patch.Diff(DIFF_INSERT,b)];if(!b)return[new diff_match_patch.Diff(DIFF_DELETE,a)];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);return-1!=g?(c=[new diff_match_patch.Diff(DIFF_INSERT,e.substring(0,g)),new diff_match_patch.Diff(DIFF_EQUAL,f),new diff_match_patch.Diff(DIFF_INSERT,e.substring(g+f.length))],a.length>b.length&&(c[0][0]=c[2][0]=DIFF_DELETE),c):1==f.length?[new diff_match_patch.Diff(DIFF_DELETE, -a),new diff_match_patch.Diff(DIFF_INSERT,b)]:(e=this.diff_halfMatch_(a,b))?(b=e[1],f=e[3],a=e[4],e=this.diff_main(e[0],e[2],c,d),c=this.diff_main(b,f,c,d),e.concat([new diff_match_patch.Diff(DIFF_EQUAL,a)],c)):c&&100c);t++){for(var v=-t+p;v<=t-x;v+=2){var n=f+v;var r=v==-t||v!=t&&h[n-1]d)x+=2;else if(y>e)p+=2;else if(m&&(n=f+k-v,0<=n&&n= -u)return this.diff_bisectSplit_(a,b,r,y,c)}}for(v=-t+w;v<=t-q;v+=2){n=f+v;u=v==-t||v!=t&&l[n-1]d)q+=2;else if(r>e)w+=2;else if(!m&&(n=f+k-v,0<=n&&n=u)))return this.diff_bisectSplit_(a,b,r,y,c)}}return[new diff_match_patch.Diff(DIFF_DELETE,a),new diff_match_patch.Diff(DIFF_INSERT,b)]}; -diff_match_patch.prototype.diff_bisectSplit_=function(a,b,c,d,e){var f=a.substring(0,c),g=b.substring(0,d);a=a.substring(c);b=b.substring(d);f=this.diff_main(f,g,!1,e);e=this.diff_main(a,b,!1,e);return f.concat(e)}; -diff_match_patch.prototype.diff_linesToChars_=function(a,b){function c(a){for(var 
b="",c=0,g=-1,h=d.length;gd?a=a.substring(c-d):c=a.length?[h,k,l,m,g]:null}if(0>=this.Diff_Timeout)return null; -var d=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>d.length||2*e.lengthd[4].length?g:d:d:g;else return null;if(a.length>b.length){d=g[0];e=g[1];var h=g[2];var l=g[3]}else h=g[0],l=g[1],d=g[2],e=g[3];return[d,e,h,l,g[4]]}; -diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=0,h=0,l=0,k=0;f=e){if(d>=b.length/2||d>=c.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,c.substring(0,d))),a[f-1][1]=b.substring(0,b.length-d),a[f+1][1]=c.substring(d),f++}else if(e>=b.length/2||e>=c.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,b.substring(0,e))),a[f-1][0]=DIFF_INSERT,a[f-1][1]=c.substring(0,c.length-e),a[f+1][0]=DIFF_DELETE, +diff_match_patch.prototype.diff_main=function(a,b,d,c){"undefined"==typeof c&&(c=0>=this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. 
(diff_main)");if(a==b)return a?[new diff_match_patch.Diff(DIFF_EQUAL,a)]:[];"undefined"==typeof d&&(d=!0);var e=d,f=this.diff_commonPrefix(a,b);d=a.substring(0,f);a=a.substring(f);b=b.substring(f);f=this.diff_commonSuffix(a,b);var g=a.substring(a.length-f);a=a.substring(0,a.length-f);b=b.substring(0, +b.length-f);a=this.diff_compute_(a,b,e,c);d&&a.unshift(new diff_match_patch.Diff(DIFF_EQUAL,d));g&&a.push(new diff_match_patch.Diff(DIFF_EQUAL,g));this.diff_cleanupMerge(a);return a}; +diff_match_patch.prototype.diff_compute_=function(a,b,d,c){if(!a)return[new diff_match_patch.Diff(DIFF_INSERT,b)];if(!b)return[new diff_match_patch.Diff(DIFF_DELETE,a)];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);return-1!=g?(d=[new diff_match_patch.Diff(DIFF_INSERT,e.substring(0,g)),new diff_match_patch.Diff(DIFF_EQUAL,f),new diff_match_patch.Diff(DIFF_INSERT,e.substring(g+f.length))],a.length>b.length&&(d[0][0]=d[2][0]=DIFF_DELETE),d):1==f.length?[new diff_match_patch.Diff(DIFF_DELETE, +a),new diff_match_patch.Diff(DIFF_INSERT,b)]:(e=this.diff_halfMatch_(a,b))?(b=e[1],f=e[3],a=e[4],e=this.diff_main(e[0],e[2],d,c),d=this.diff_main(b,f,d,c),e.concat([new diff_match_patch.Diff(DIFF_EQUAL,a)],d)):d&&100d);u++){for(var r=-u+p;r<=u-x;r+=2){var n=f+r;var q=r==-u||r!=u&&h[n-1]c)x+=2;else if(y>e)p+=2;else if(m&&(n=f+k-r,0<=n&&n= +v)return this.diff_bisectSplit_(a,b,q,y,d)}}for(r=-u+w;r<=u-t;r+=2){n=f+r;v=r==-u||r!=u&&l[n-1]c)t+=2;else if(q>e)w+=2;else if(!m&&(n=f+k-r,0<=n&&n=v)))return this.diff_bisectSplit_(a,b,q,y,d)}}return[new diff_match_patch.Diff(DIFF_DELETE,a),new diff_match_patch.Diff(DIFF_INSERT,b)]}; +diff_match_patch.prototype.diff_bisectSplit_=function(a,b,d,c,e){var f=a.substring(0,d),g=b.substring(0,c);a=a.substring(d);b=b.substring(c);f=this.diff_main(f,g,!1,e);e=this.diff_main(a,b,!1,e);return f.concat(e)}; +diff_match_patch.prototype.diff_linesToChars_=function(a,b){function d(a){for(var 
b="",d=0,g=-1,m=c.length;gc?a=a.substring(d-c):d=a.length?[h,k,l,r,g]:null}if(0>=this.Diff_Timeout)return null; +var c=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>c.length||2*e.lengthc[4].length?g:c:c:g;else return null;a.length>b.length?(a=g[0],b=g[1],c=g[2],e=g[3]):(c=g[0],e=g[1],a=g[2],b=g[3]);return[a,b,c,e,g[4]]}; +diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,d=[],c=0,e=null,f=0,g=0,h=0,l=0,k=0;f=e){if(c>=b.length/2||c>=d.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,d.substring(0,c))),a[f-1][1]=b.substring(0,b.length-c),a[f+1][1]=d.substring(c),f++}else if(e>=b.length/2||e>=d.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,b.substring(0,e))),a[f-1][0]=DIFF_INSERT,a[f-1][1]=d.substring(0,d.length-e),a[f+1][0]=DIFF_DELETE, a[f+1][1]=b.substring(e),f++;f++}f++}}; -diff_match_patch.prototype.diff_cleanupSemanticLossless=function(a){function b(a,b){if(!a||!b)return 6;var c=a.charAt(a.length-1),d=b.charAt(0),e=c.match(diff_match_patch.nonAlphaNumericRegex_),f=d.match(diff_match_patch.nonAlphaNumericRegex_),g=e&&c.match(diff_match_patch.whitespaceRegex_),h=f&&d.match(diff_match_patch.whitespaceRegex_);c=g&&c.match(diff_match_patch.linebreakRegex_);d=h&&d.match(diff_match_patch.linebreakRegex_);var k=c&&a.match(diff_match_patch.blanklineEndRegex_),l=d&&b.match(diff_match_patch.blanklineStartRegex_); -return k||l?5:c||d?4:e&&!g&&h?3:g||h?2:e||f?1:0}for(var c=1;c=k&&(k=m,g=d,h=e,l=f)}a[c-1][1]!=g&&(g?a[c-1][1]=g:(a.splice(c- -1,1),c--),a[c][1]=h,l?a[c+1][1]=l:(a.splice(c+1,1),c--))}c++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/; -diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=!1,h=!1,l=!1,k=!1;fb)break;e=c;f=d}return a.length!=g&&a[g][0]===DIFF_DELETE?f:f+(b-e)}; 
-diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],c=/&/g,d=//g,f=/\n/g,g=0;g");switch(h){case DIFF_INSERT:b[g]=''+l+"";break;case DIFF_DELETE:b[g]=''+l+"";break;case DIFF_EQUAL:b[g]=""+l+""}}return b.join("")}; -diff_match_patch.prototype.diff_text1=function(a){for(var b=[],c=0;cthis.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,c);-1!=h&&(g=Math.min(d(0,h),g),h=a.lastIndexOf(b,c+b.length),-1!=h&&(g=Math.min(d(0,h),g)));var l=1<=k;q--){var t=e[a.charAt(q-1)];m[q]=0===w?(m[q+1]<<1|1)&t:(m[q+1]<<1|1)&t|(x[q+1]|x[q])<<1|1|x[q+1];if(m[q]&l&&(t=d(w,q-1),t<=g))if(g=t,h=q-1,h>c)k=Math.max(1,2*c-h);else break}if(d(w+1,c)>g)break;x=m}return h}; -diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},c=0;c=2*this.Patch_Margin&&e&&(this.patch_addContext_(a,h),c.push(a),a=new diff_match_patch.patch_obj,e=0,h=d,f=g)}k!==DIFF_INSERT&&(f+=m.length);k!==DIFF_DELETE&&(g+=m.length)}e&&(this.patch_addContext_(a,h),c.push(a));return c}; -diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],c=0;cthis.Match_MaxBits){var k=this.match_main(b,h.substring(0,this.Match_MaxBits),g);-1!=k&&(l=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==l||k>=l)&&(k=-1)}else k=this.match_main(b,h, -g);if(-1==k)e[f]=!1,d-=a[f].length2-a[f].length1;else if(e[f]=!0,d=k-g,g=-1==l?b.substring(k,k+h.length):b.substring(k,l+this.Match_MaxBits),h==g)b=b.substring(0,k)+this.diff_text2(a[f].diffs)+b.substring(k+h.length);else if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);h=0;var m;for(l=0;le[0][1].length){var f=b-e[0][1].length;e[0][1]=c.substring(e[0][1].length)+e[0][1];d.start1-=f;d.start2-=f;d.length1+=f;d.length2+=f}d=a[a.length-1];e=d.diffs; -0==e.length||e[e.length-1][0]!=DIFF_EQUAL?(e.push(new 
diff_match_patch.Diff(DIFF_EQUAL,c)),d.length1+=b,d.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=c.substring(0,f),d.length1+=f,d.length2+=f);return c}; -diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,c=0;c2*b?(h.length1+=k.length,e+=k.length,l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),d.diffs.shift()):(k=k.substring(0,b-h.length1-this.Patch_Margin),h.length1+=k.length,e+=k.length,g===DIFF_EQUAL?(h.length2+=k.length,f+=k.length):l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),k==d.diffs[0][1]?d.diffs.shift():d.diffs[0][1]=d.diffs[0][1].substring(k.length))}g=this.diff_text2(h.diffs); -g=g.substring(g.length-this.Patch_Margin);k=this.diff_text1(d.diffs).substring(0,this.Patch_Margin);""!==k&&(h.length1+=k.length,h.length2+=k.length,0!==h.diffs.length&&h.diffs[h.diffs.length-1][0]===DIFF_EQUAL?h.diffs[h.diffs.length-1][1]+=k:h.diffs.push(new diff_match_patch.Diff(DIFF_EQUAL,k)));l||a.splice(++c,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],c=0;c=k&&(k=m,g=c,h=e,l=f)}a[d-1][1]!=g&&(g?a[d-1][1]=g:(a.splice(d- +1,1),d--),a[d][1]=h,l?a[d+1][1]=l:(a.splice(d+1,1),d--))}d++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/; +diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,d=[],c=0,e=null,f=0,g=!1,h=!1,l=!1,k=!1;fb)break;e=d;f=c}return a.length!=g&&a[g][0]===DIFF_DELETE?f:f+(b-e)}; +diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],d=/&/g,c=//g,f=/\n/g,g=0;g");switch(h){case DIFF_INSERT:b[g]=''+l+"";break;case DIFF_DELETE:b[g]=''+l+"";break;case DIFF_EQUAL:b[g]=""+l+""}}return b.join("")}; +diff_match_patch.prototype.diff_text1=function(a){for(var b=[],d=0;d=a};diff_match_patch.prototype.isLowSurrogate=function(a){a=a.charCodeAt(0);return 
56320<=a&&57343>=a}; +diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],d,c=0;cthis.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,d);-1!=h&&(g=Math.min(c(0,h),g),h=a.lastIndexOf(b,d+b.length),-1!=h&&(g=Math.min(c(0,h),g)));var l=1<=k;t--){var u=e[a.charAt(t-1)];m[t]=0===w?(m[t+1]<<1|1)&u:(m[t+1]<<1|1)&u|(x[t+1]|x[t])<<1|1|x[t+1];if(m[t]&l&&(u=c(w,t-1),u<=g))if(g=u,h=t-1,h>d)k=Math.max(1,2*d-h);else break}if(c(w+1,d)>g)break;x=m}return h}; +diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},d=0;d=2*this.Patch_Margin&&e&&(this.patch_addContext_(a,h),d.push(a),a=new diff_match_patch.patch_obj,e=0,h=c,f=g)}k!==DIFF_INSERT&&(f+=m.length);k!==DIFF_DELETE&&(g+=m.length)}e&&(this.patch_addContext_(a,h),d.push(a));return d}; +diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],d=0;dthis.Match_MaxBits){var k=this.match_main(b,h.substring(0,this.Match_MaxBits),g);-1!=k&&(l=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==l||k>=l)&&(k=-1)}else k=this.match_main(b,h, +g);if(-1==k)e[f]=!1,c-=a[f].length2-a[f].length1;else if(e[f]=!0,c=k-g,g=-1==l?b.substring(k,k+h.length):b.substring(k,l+this.Match_MaxBits),h==g)b=b.substring(0,k)+this.diff_text2(a[f].diffs)+b.substring(k+h.length);else if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);h=0;var m;for(l=0;le[0][1].length){var f=b-e[0][1].length;e[0][1]=d.substring(e[0][1].length)+e[0][1];c.start1-=f;c.start2-=f;c.length1+=f;c.length2+=f}c=a[a.length-1];e=c.diffs; +0==e.length||e[e.length-1][0]!=DIFF_EQUAL?(e.push(new diff_match_patch.Diff(DIFF_EQUAL,d)),c.length1+=b,c.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=d.substring(0,f),c.length1+=f,c.length2+=f);return d}; 
+diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,d=0;d2*b?(h.length1+=k.length,e+=k.length,l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),c.diffs.shift()):(k=k.substring(0,b-h.length1-this.Patch_Margin),h.length1+=k.length,e+=k.length,g===DIFF_EQUAL?(h.length2+=k.length,f+=k.length):l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),k==c.diffs[0][1]?c.diffs.shift():c.diffs[0][1]=c.diffs[0][1].substring(k.length))}g=this.diff_text2(h.diffs); +g=g.substring(g.length-this.Patch_Margin);k=this.diff_text1(c.diffs).substring(0,this.Patch_Margin);""!==k&&(h.length1+=k.length,h.length2+=k.length,0!==h.diffs.length&&h.diffs[h.diffs.length-1][0]===DIFF_EQUAL?h.diffs[h.diffs.length-1][1]+=k:h.diffs.push(new diff_match_patch.Diff(DIFF_EQUAL,k)));l||a.splice(++d,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],d=0;d Date: Thu, 12 Dec 2019 20:36:02 -0700 Subject: [PATCH 13/34] Handle cases where we delete characters In the previous iteration of this patch we were only properly handling cases where a new surrogate pair was inserted in between two existing pairs whose high surrogates all matched. Unfortunately when swapping characters or performing any edits where we delete a surrogate pair the patch failed because it only carried the trailing high surrogate over to the next group instead of distributing it to any insert _and_ delete groups following an equality group. In this patch I've updated the JavaScript library to properly distribute the trailing high surrogate. 
--- javascript/diff_match_patch.js | 105 ++++++++++---------- javascript/diff_match_patch_uncompressed.js | 24 +++-- javascript/tests/diff_match_patch_test.js | 39 +++++++- 3 files changed, 109 insertions(+), 59 deletions(-) diff --git a/javascript/diff_match_patch.js b/javascript/diff_match_patch.js index 7fce1db..4ba112a 100644 --- a/javascript/diff_match_patch.js +++ b/javascript/diff_match_patch.js @@ -1,55 +1,56 @@ var diff_match_patch=function(){this.Diff_Timeout=1;this.Diff_EditCost=4;this.Match_Threshold=.5;this.Match_Distance=1E3;this.Patch_DeleteThreshold=.5;this.Patch_Margin=4;this.Match_MaxBits=32},DIFF_DELETE=-1,DIFF_INSERT=1,DIFF_EQUAL=0;diff_match_patch.Diff=function(a,b){this[0]=a;this[1]=b};diff_match_patch.Diff.prototype.length=2;diff_match_patch.Diff.prototype.toString=function(){return this[0]+","+this[1]}; -diff_match_patch.prototype.diff_main=function(a,b,d,c){"undefined"==typeof c&&(c=0>=this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. 
(diff_main)");if(a==b)return a?[new diff_match_patch.Diff(DIFF_EQUAL,a)]:[];"undefined"==typeof d&&(d=!0);var e=d,f=this.diff_commonPrefix(a,b);d=a.substring(0,f);a=a.substring(f);b=b.substring(f);f=this.diff_commonSuffix(a,b);var g=a.substring(a.length-f);a=a.substring(0,a.length-f);b=b.substring(0, -b.length-f);a=this.diff_compute_(a,b,e,c);d&&a.unshift(new diff_match_patch.Diff(DIFF_EQUAL,d));g&&a.push(new diff_match_patch.Diff(DIFF_EQUAL,g));this.diff_cleanupMerge(a);return a}; -diff_match_patch.prototype.diff_compute_=function(a,b,d,c){if(!a)return[new diff_match_patch.Diff(DIFF_INSERT,b)];if(!b)return[new diff_match_patch.Diff(DIFF_DELETE,a)];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);return-1!=g?(d=[new diff_match_patch.Diff(DIFF_INSERT,e.substring(0,g)),new diff_match_patch.Diff(DIFF_EQUAL,f),new diff_match_patch.Diff(DIFF_INSERT,e.substring(g+f.length))],a.length>b.length&&(d[0][0]=d[2][0]=DIFF_DELETE),d):1==f.length?[new diff_match_patch.Diff(DIFF_DELETE, -a),new diff_match_patch.Diff(DIFF_INSERT,b)]:(e=this.diff_halfMatch_(a,b))?(b=e[1],f=e[3],a=e[4],e=this.diff_main(e[0],e[2],d,c),d=this.diff_main(b,f,d,c),e.concat([new diff_match_patch.Diff(DIFF_EQUAL,a)],d)):d&&100d);u++){for(var r=-u+p;r<=u-x;r+=2){var n=f+r;var q=r==-u||r!=u&&h[n-1]c)x+=2;else if(y>e)p+=2;else if(m&&(n=f+k-r,0<=n&&n= -v)return this.diff_bisectSplit_(a,b,q,y,d)}}for(r=-u+w;r<=u-t;r+=2){n=f+r;v=r==-u||r!=u&&l[n-1]c)t+=2;else if(q>e)w+=2;else if(!m&&(n=f+k-r,0<=n&&n=v)))return this.diff_bisectSplit_(a,b,q,y,d)}}return[new diff_match_patch.Diff(DIFF_DELETE,a),new diff_match_patch.Diff(DIFF_INSERT,b)]}; -diff_match_patch.prototype.diff_bisectSplit_=function(a,b,d,c,e){var f=a.substring(0,d),g=b.substring(0,c);a=a.substring(d);b=b.substring(c);f=this.diff_main(f,g,!1,e);e=this.diff_main(a,b,!1,e);return f.concat(e)}; -diff_match_patch.prototype.diff_linesToChars_=function(a,b){function d(a){for(var 
b="",d=0,g=-1,m=c.length;gc?a=a.substring(d-c):d=a.length?[h,k,l,r,g]:null}if(0>=this.Diff_Timeout)return null; -var c=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>c.length||2*e.lengthc[4].length?g:c:c:g;else return null;a.length>b.length?(a=g[0],b=g[1],c=g[2],e=g[3]):(c=g[0],e=g[1],a=g[2],b=g[3]);return[a,b,c,e,g[4]]}; -diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,d=[],c=0,e=null,f=0,g=0,h=0,l=0,k=0;f=e){if(c>=b.length/2||c>=d.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,d.substring(0,c))),a[f-1][1]=b.substring(0,b.length-c),a[f+1][1]=d.substring(c),f++}else if(e>=b.length/2||e>=d.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,b.substring(0,e))),a[f-1][0]=DIFF_INSERT,a[f-1][1]=d.substring(0,d.length-e),a[f+1][0]=DIFF_DELETE, +diff_match_patch.prototype.diff_main=function(a,b,c,d){"undefined"==typeof d&&(d=0>=this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. 
(diff_main)");if(a==b)return a?[new diff_match_patch.Diff(DIFF_EQUAL,a)]:[];"undefined"==typeof c&&(c=!0);var e=c,f=this.diff_commonPrefix(a,b);c=a.substring(0,f);a=a.substring(f);b=b.substring(f);f=this.diff_commonSuffix(a,b);var g=a.substring(a.length-f);a=a.substring(0,a.length-f);b=b.substring(0, +b.length-f);a=this.diff_compute_(a,b,e,d);c&&a.unshift(new diff_match_patch.Diff(DIFF_EQUAL,c));g&&a.push(new diff_match_patch.Diff(DIFF_EQUAL,g));this.diff_cleanupMerge(a);return a}; +diff_match_patch.prototype.diff_compute_=function(a,b,c,d){if(!a)return[new diff_match_patch.Diff(DIFF_INSERT,b)];if(!b)return[new diff_match_patch.Diff(DIFF_DELETE,a)];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);return-1!=g?(c=[new diff_match_patch.Diff(DIFF_INSERT,e.substring(0,g)),new diff_match_patch.Diff(DIFF_EQUAL,f),new diff_match_patch.Diff(DIFF_INSERT,e.substring(g+f.length))],a.length>b.length&&(c[0][0]=c[2][0]=DIFF_DELETE),c):1==f.length?[new diff_match_patch.Diff(DIFF_DELETE, +a),new diff_match_patch.Diff(DIFF_INSERT,b)]:(e=this.diff_halfMatch_(a,b))?(b=e[1],f=e[3],a=e[4],e=this.diff_main(e[0],e[2],c,d),c=this.diff_main(b,f,c,d),e.concat([new diff_match_patch.Diff(DIFF_EQUAL,a)],c)):c&&100c);t++){for(var v=-t+p;v<=t-x;v+=2){var n=f+v;var r=v==-t||v!=t&&h[n-1]d)x+=2;else if(y>e)p+=2;else if(m&&(n=f+k-v,0<=n&&n= +u)return this.diff_bisectSplit_(a,b,r,y,c)}}for(v=-t+w;v<=t-q;v+=2){n=f+v;u=v==-t||v!=t&&l[n-1]d)q+=2;else if(r>e)w+=2;else if(!m&&(n=f+k-v,0<=n&&n=u)))return this.diff_bisectSplit_(a,b,r,y,c)}}return[new diff_match_patch.Diff(DIFF_DELETE,a),new diff_match_patch.Diff(DIFF_INSERT,b)]}; +diff_match_patch.prototype.diff_bisectSplit_=function(a,b,c,d,e){var f=a.substring(0,c),g=b.substring(0,d);a=a.substring(c);b=b.substring(d);f=this.diff_main(f,g,!1,e);e=this.diff_main(a,b,!1,e);return f.concat(e)}; +diff_match_patch.prototype.diff_linesToChars_=function(a,b){function c(a){for(var 
b="",c=0,g=-1,h=d.length;gd?a=a.substring(c-d):c=a.length?[h,k,l,m,g]:null}if(0>=this.Diff_Timeout)return null; +var d=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>d.length||2*e.lengthd[4].length?g:d:d:g;else return null;if(a.length>b.length){d=g[0];e=g[1];var h=g[2];var l=g[3]}else h=g[0],l=g[1],d=g[2],e=g[3];return[d,e,h,l,g[4]]}; +diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=0,h=0,l=0,k=0;f=e){if(d>=b.length/2||d>=c.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,c.substring(0,d))),a[f-1][1]=b.substring(0,b.length-d),a[f+1][1]=c.substring(d),f++}else if(e>=b.length/2||e>=c.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,b.substring(0,e))),a[f-1][0]=DIFF_INSERT,a[f-1][1]=c.substring(0,c.length-e),a[f+1][0]=DIFF_DELETE, a[f+1][1]=b.substring(e),f++;f++}f++}}; -diff_match_patch.prototype.diff_cleanupSemanticLossless=function(a){function b(a,b){if(!a||!b)return 6;var c=a.charAt(a.length-1),d=b.charAt(0),e=c.match(diff_match_patch.nonAlphaNumericRegex_),f=d.match(diff_match_patch.nonAlphaNumericRegex_),g=e&&c.match(diff_match_patch.whitespaceRegex_),h=f&&d.match(diff_match_patch.whitespaceRegex_);c=g&&c.match(diff_match_patch.linebreakRegex_);d=h&&d.match(diff_match_patch.linebreakRegex_);a=c&&a.match(diff_match_patch.blanklineEndRegex_);b=d&&b.match(diff_match_patch.blanklineStartRegex_); -return a||b?5:c||d?4:e&&!g&&h?3:g||h?2:e||f?1:0}for(var d=1;d=k&&(k=m,g=c,h=e,l=f)}a[d-1][1]!=g&&(g?a[d-1][1]=g:(a.splice(d- -1,1),d--),a[d][1]=h,l?a[d+1][1]=l:(a.splice(d+1,1),d--))}d++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/; -diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,d=[],c=0,e=null,f=0,g=!1,h=!1,l=!1,k=!1;fb)break;e=d;f=c}return a.length!=g&&a[g][0]===DIFF_DELETE?f:f+(b-e)}; 
-diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],d=/&/g,c=//g,f=/\n/g,g=0;g");switch(h){case DIFF_INSERT:b[g]=''+l+"";break;case DIFF_DELETE:b[g]=''+l+"";break;case DIFF_EQUAL:b[g]=""+l+""}}return b.join("")}; -diff_match_patch.prototype.diff_text1=function(a){for(var b=[],d=0;d=a};diff_match_patch.prototype.isLowSurrogate=function(a){a=a.charCodeAt(0);return 56320<=a&&57343>=a}; -diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],d,c=0;cthis.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,d);-1!=h&&(g=Math.min(c(0,h),g),h=a.lastIndexOf(b,d+b.length),-1!=h&&(g=Math.min(c(0,h),g)));var l=1<=k;t--){var u=e[a.charAt(t-1)];m[t]=0===w?(m[t+1]<<1|1)&u:(m[t+1]<<1|1)&u|(x[t+1]|x[t])<<1|1|x[t+1];if(m[t]&l&&(u=c(w,t-1),u<=g))if(g=u,h=t-1,h>d)k=Math.max(1,2*d-h);else break}if(c(w+1,d)>g)break;x=m}return h}; -diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},d=0;d=2*this.Patch_Margin&&e&&(this.patch_addContext_(a,h),d.push(a),a=new diff_match_patch.patch_obj,e=0,h=c,f=g)}k!==DIFF_INSERT&&(f+=m.length);k!==DIFF_DELETE&&(g+=m.length)}e&&(this.patch_addContext_(a,h),d.push(a));return d}; -diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],d=0;dthis.Match_MaxBits){var k=this.match_main(b,h.substring(0,this.Match_MaxBits),g);-1!=k&&(l=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==l||k>=l)&&(k=-1)}else k=this.match_main(b,h, -g);if(-1==k)e[f]=!1,c-=a[f].length2-a[f].length1;else if(e[f]=!0,c=k-g,g=-1==l?b.substring(k,k+h.length):b.substring(k,l+this.Match_MaxBits),h==g)b=b.substring(0,k)+this.diff_text2(a[f].diffs)+b.substring(k+h.length);else if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);h=0;var m;for(l=0;le[0][1].length){var 
f=b-e[0][1].length;e[0][1]=d.substring(e[0][1].length)+e[0][1];c.start1-=f;c.start2-=f;c.length1+=f;c.length2+=f}c=a[a.length-1];e=c.diffs; -0==e.length||e[e.length-1][0]!=DIFF_EQUAL?(e.push(new diff_match_patch.Diff(DIFF_EQUAL,d)),c.length1+=b,c.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=d.substring(0,f),c.length1+=f,c.length2+=f);return d}; -diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,d=0;d2*b?(h.length1+=k.length,e+=k.length,l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),c.diffs.shift()):(k=k.substring(0,b-h.length1-this.Patch_Margin),h.length1+=k.length,e+=k.length,g===DIFF_EQUAL?(h.length2+=k.length,f+=k.length):l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),k==c.diffs[0][1]?c.diffs.shift():c.diffs[0][1]=c.diffs[0][1].substring(k.length))}g=this.diff_text2(h.diffs); -g=g.substring(g.length-this.Patch_Margin);k=this.diff_text1(c.diffs).substring(0,this.Patch_Margin);""!==k&&(h.length1+=k.length,h.length2+=k.length,0!==h.diffs.length&&h.diffs[h.diffs.length-1][0]===DIFF_EQUAL?h.diffs[h.diffs.length-1][1]+=k:h.diffs.push(new diff_match_patch.Diff(DIFF_EQUAL,k)));l||a.splice(++d,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],d=0;d=k&&(k=m,g=d,h=e,l=f)}a[c-1][1]!=g&&(g?a[c-1][1]=g:(a.splice(c- +1,1),c--),a[c][1]=h,l?a[c+1][1]=l:(a.splice(c+1,1),c--))}c++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/; +diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=!1,h=!1,l=!1,k=!1;fb)break;e=c;f=d}return a.length!=g&&a[g][0]===DIFF_DELETE?f:f+(b-e)}; +diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],c=/&/g,d=//g,f=/\n/g,g=0;g");switch(h){case DIFF_INSERT:b[g]=''+l+"";break;case DIFF_DELETE:b[g]=''+l+"";break;case 
DIFF_EQUAL:b[g]=""+l+""}}return b.join("")}; +diff_match_patch.prototype.diff_text1=function(a){for(var b=[],c=0;c=a};diff_match_patch.prototype.isLowSurrogate=function(a){a=a.charCodeAt(0);return 56320<=a&&57343>=a}; +diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],c,d=0;dthis.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,c);-1!=h&&(g=Math.min(d(0,h),g),h=a.lastIndexOf(b,c+b.length),-1!=h&&(g=Math.min(d(0,h),g)));var l=1<=k;q--){var t=e[a.charAt(q-1)];m[q]=0===w?(m[q+1]<<1|1)&t:(m[q+1]<<1|1)&t|(x[q+1]|x[q])<<1|1|x[q+1];if(m[q]&l&&(t=d(w,q-1),t<=g))if(g=t,h=q-1,h>c)k=Math.max(1,2*c-h);else break}if(d(w+1,c)>g)break;x=m}return h}; +diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},c=0;c=2*this.Patch_Margin&&e&&(this.patch_addContext_(a,h),c.push(a),a=new diff_match_patch.patch_obj,e=0,h=d,f=g)}k!==DIFF_INSERT&&(f+=m.length);k!==DIFF_DELETE&&(g+=m.length)}e&&(this.patch_addContext_(a,h),c.push(a));return c}; +diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],c=0;cthis.Match_MaxBits){var k=this.match_main(b,h.substring(0,this.Match_MaxBits),g);-1!=k&&(l=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==l||k>=l)&&(k=-1)}else k=this.match_main(b,h, +g);if(-1==k)e[f]=!1,d-=a[f].length2-a[f].length1;else if(e[f]=!0,d=k-g,g=-1==l?b.substring(k,k+h.length):b.substring(k,l+this.Match_MaxBits),h==g)b=b.substring(0,k)+this.diff_text2(a[f].diffs)+b.substring(k+h.length);else if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);h=0;var m;for(l=0;le[0][1].length){var f=b-e[0][1].length;e[0][1]=c.substring(e[0][1].length)+e[0][1];d.start1-=f;d.start2-=f;d.length1+=f;d.length2+=f}d=a[a.length-1];e=d.diffs; +0==e.length||e[e.length-1][0]!=DIFF_EQUAL?(e.push(new 
diff_match_patch.Diff(DIFF_EQUAL,c)),d.length1+=b,d.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=c.substring(0,f),d.length1+=f,d.length2+=f);return c}; +diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,c=0;c2*b?(h.length1+=k.length,e+=k.length,l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),d.diffs.shift()):(k=k.substring(0,b-h.length1-this.Patch_Margin),h.length1+=k.length,e+=k.length,g===DIFF_EQUAL?(h.length2+=k.length,f+=k.length):l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),k==d.diffs[0][1]?d.diffs.shift():d.diffs[0][1]=d.diffs[0][1].substring(k.length))}g=this.diff_text2(h.diffs); +g=g.substring(g.length-this.Patch_Margin);k=this.diff_text1(d.diffs).substring(0,this.Patch_Margin);""!==k&&(h.length1+=k.length,h.length2+=k.length,0!==h.diffs.length&&h.diffs[h.diffs.length-1][0]===DIFF_EQUAL?h.diffs[h.diffs.length-1][1]+=k:h.diffs.push(new diff_match_patch.Diff(DIFF_EQUAL,k)));l||a.splice(++c,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],c=0;c Date: Thu, 12 Dec 2019 23:23:23 -0700 Subject: [PATCH 14/34] Augment diff_fromDelta with custom decodeURI for encoded surrogate halves Because sometimes we get a patch that was built in Python or another library that will happily URI-encode half of a surrogate pair. 
--- javascript/diff_match_patch.js | 103 ++++++++++---------- javascript/diff_match_patch_uncompressed.js | 96 +++++++++++++++++- javascript/tests/diff_match_patch_test.js | 16 ++- 3 files changed, 160 insertions(+), 55 deletions(-) diff --git a/javascript/diff_match_patch.js b/javascript/diff_match_patch.js index 4ba112a..37bd784 100644 --- a/javascript/diff_match_patch.js +++ b/javascript/diff_match_patch.js @@ -1,56 +1,59 @@ var diff_match_patch=function(){this.Diff_Timeout=1;this.Diff_EditCost=4;this.Match_Threshold=.5;this.Match_Distance=1E3;this.Patch_DeleteThreshold=.5;this.Patch_Margin=4;this.Match_MaxBits=32},DIFF_DELETE=-1,DIFF_INSERT=1,DIFF_EQUAL=0;diff_match_patch.Diff=function(a,b){this[0]=a;this[1]=b};diff_match_patch.Diff.prototype.length=2;diff_match_patch.Diff.prototype.toString=function(){return this[0]+","+this[1]}; -diff_match_patch.prototype.diff_main=function(a,b,c,d){"undefined"==typeof d&&(d=0>=this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. 
(diff_main)");if(a==b)return a?[new diff_match_patch.Diff(DIFF_EQUAL,a)]:[];"undefined"==typeof c&&(c=!0);var e=c,f=this.diff_commonPrefix(a,b);c=a.substring(0,f);a=a.substring(f);b=b.substring(f);f=this.diff_commonSuffix(a,b);var g=a.substring(a.length-f);a=a.substring(0,a.length-f);b=b.substring(0, -b.length-f);a=this.diff_compute_(a,b,e,d);c&&a.unshift(new diff_match_patch.Diff(DIFF_EQUAL,c));g&&a.push(new diff_match_patch.Diff(DIFF_EQUAL,g));this.diff_cleanupMerge(a);return a}; -diff_match_patch.prototype.diff_compute_=function(a,b,c,d){if(!a)return[new diff_match_patch.Diff(DIFF_INSERT,b)];if(!b)return[new diff_match_patch.Diff(DIFF_DELETE,a)];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);return-1!=g?(c=[new diff_match_patch.Diff(DIFF_INSERT,e.substring(0,g)),new diff_match_patch.Diff(DIFF_EQUAL,f),new diff_match_patch.Diff(DIFF_INSERT,e.substring(g+f.length))],a.length>b.length&&(c[0][0]=c[2][0]=DIFF_DELETE),c):1==f.length?[new diff_match_patch.Diff(DIFF_DELETE, -a),new diff_match_patch.Diff(DIFF_INSERT,b)]:(e=this.diff_halfMatch_(a,b))?(b=e[1],f=e[3],a=e[4],e=this.diff_main(e[0],e[2],c,d),c=this.diff_main(b,f,c,d),e.concat([new diff_match_patch.Diff(DIFF_EQUAL,a)],c)):c&&100c);t++){for(var v=-t+p;v<=t-x;v+=2){var n=f+v;var r=v==-t||v!=t&&h[n-1]d)x+=2;else if(y>e)p+=2;else if(m&&(n=f+k-v,0<=n&&n= -u)return this.diff_bisectSplit_(a,b,r,y,c)}}for(v=-t+w;v<=t-q;v+=2){n=f+v;u=v==-t||v!=t&&l[n-1]d)q+=2;else if(r>e)w+=2;else if(!m&&(n=f+k-v,0<=n&&n=u)))return this.diff_bisectSplit_(a,b,r,y,c)}}return[new diff_match_patch.Diff(DIFF_DELETE,a),new diff_match_patch.Diff(DIFF_INSERT,b)]}; -diff_match_patch.prototype.diff_bisectSplit_=function(a,b,c,d,e){var f=a.substring(0,c),g=b.substring(0,d);a=a.substring(c);b=b.substring(d);f=this.diff_main(f,g,!1,e);e=this.diff_main(a,b,!1,e);return f.concat(e)}; -diff_match_patch.prototype.diff_linesToChars_=function(a,b){function c(a){for(var 
b="",c=0,g=-1,h=d.length;gd?a=a.substring(c-d):c=a.length?[h,k,l,m,g]:null}if(0>=this.Diff_Timeout)return null; -var d=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>d.length||2*e.lengthd[4].length?g:d:d:g;else return null;if(a.length>b.length){d=g[0];e=g[1];var h=g[2];var l=g[3]}else h=g[0],l=g[1],d=g[2],e=g[3];return[d,e,h,l,g[4]]}; -diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=0,h=0,l=0,k=0;f=e){if(d>=b.length/2||d>=c.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,c.substring(0,d))),a[f-1][1]=b.substring(0,b.length-d),a[f+1][1]=c.substring(d),f++}else if(e>=b.length/2||e>=c.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,b.substring(0,e))),a[f-1][0]=DIFF_INSERT,a[f-1][1]=c.substring(0,c.length-e),a[f+1][0]=DIFF_DELETE, +diff_match_patch.prototype.diff_main=function(a,b,d,c){"undefined"==typeof c&&(c=0>=this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. 
(diff_main)");if(a==b)return a?[new diff_match_patch.Diff(DIFF_EQUAL,a)]:[];"undefined"==typeof d&&(d=!0);var e=d,f=this.diff_commonPrefix(a,b);d=a.substring(0,f);a=a.substring(f);b=b.substring(f);f=this.diff_commonSuffix(a,b);var g=a.substring(a.length-f);a=a.substring(0,a.length-f);b=b.substring(0, +b.length-f);a=this.diff_compute_(a,b,e,c);d&&a.unshift(new diff_match_patch.Diff(DIFF_EQUAL,d));g&&a.push(new diff_match_patch.Diff(DIFF_EQUAL,g));this.diff_cleanupMerge(a);return a}; +diff_match_patch.prototype.diff_compute_=function(a,b,d,c){if(!a)return[new diff_match_patch.Diff(DIFF_INSERT,b)];if(!b)return[new diff_match_patch.Diff(DIFF_DELETE,a)];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);return-1!=g?(d=[new diff_match_patch.Diff(DIFF_INSERT,e.substring(0,g)),new diff_match_patch.Diff(DIFF_EQUAL,f),new diff_match_patch.Diff(DIFF_INSERT,e.substring(g+f.length))],a.length>b.length&&(d[0][0]=d[2][0]=DIFF_DELETE),d):1==f.length?[new diff_match_patch.Diff(DIFF_DELETE, +a),new diff_match_patch.Diff(DIFF_INSERT,b)]:(e=this.diff_halfMatch_(a,b))?(b=e[1],f=e[3],a=e[4],e=this.diff_main(e[0],e[2],d,c),d=this.diff_main(b,f,d,c),e.concat([new diff_match_patch.Diff(DIFF_EQUAL,a)],d)):d&&100d);t++){for(var v=-t+p;v<=t-x;v+=2){var n=f+v;var r=v==-t||v!=t&&h[n-1]c)x+=2;else if(y>e)p+=2;else if(m&&(n=f+k-v,0<=n&&n= +u)return this.diff_bisectSplit_(a,b,r,y,d)}}for(v=-t+w;v<=t-q;v+=2){n=f+v;u=v==-t||v!=t&&l[n-1]c)q+=2;else if(r>e)w+=2;else if(!m&&(n=f+k-v,0<=n&&n=u)))return this.diff_bisectSplit_(a,b,r,y,d)}}return[new diff_match_patch.Diff(DIFF_DELETE,a),new diff_match_patch.Diff(DIFF_INSERT,b)]}; +diff_match_patch.prototype.diff_bisectSplit_=function(a,b,d,c,e){var f=a.substring(0,d),g=b.substring(0,c);a=a.substring(d);b=b.substring(c);f=this.diff_main(f,g,!1,e);e=this.diff_main(a,b,!1,e);return f.concat(e)}; +diff_match_patch.prototype.diff_linesToChars_=function(a,b){function d(a){for(var 
b="",d=0,g=-1,h=c.length;gc?a=a.substring(d-c):d=a.length?[h,k,l,m,g]:null}if(0>=this.Diff_Timeout)return null; +var c=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>c.length||2*e.lengthc[4].length?g:c:c:g;else return null;if(a.length>b.length){c=g[0];e=g[1];var h=g[2];var l=g[3]}else h=g[0],l=g[1],c=g[2],e=g[3];return[c,e,h,l,g[4]]}; +diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,d=[],c=0,e=null,f=0,g=0,h=0,l=0,k=0;f=e){if(c>=b.length/2||c>=d.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,d.substring(0,c))),a[f-1][1]=b.substring(0,b.length-c),a[f+1][1]=d.substring(c),f++}else if(e>=b.length/2||e>=d.length/2)a.splice(f,0,new diff_match_patch.Diff(DIFF_EQUAL,b.substring(0,e))),a[f-1][0]=DIFF_INSERT,a[f-1][1]=d.substring(0,d.length-e),a[f+1][0]=DIFF_DELETE, a[f+1][1]=b.substring(e),f++;f++}f++}}; diff_match_patch.prototype.diff_cleanupSemanticLossless=function(a){function b(a,b){if(!a||!b)return 6;var c=a.charAt(a.length-1),d=b.charAt(0),e=c.match(diff_match_patch.nonAlphaNumericRegex_),f=d.match(diff_match_patch.nonAlphaNumericRegex_),g=e&&c.match(diff_match_patch.whitespaceRegex_),h=f&&d.match(diff_match_patch.whitespaceRegex_);c=g&&c.match(diff_match_patch.linebreakRegex_);d=h&&d.match(diff_match_patch.linebreakRegex_);var k=c&&a.match(diff_match_patch.blanklineEndRegex_),l=d&&b.match(diff_match_patch.blanklineStartRegex_); -return k||l?5:c||d?4:e&&!g&&h?3:g||h?2:e||f?1:0}for(var c=1;c=k&&(k=m,g=d,h=e,l=f)}a[c-1][1]!=g&&(g?a[c-1][1]=g:(a.splice(c- -1,1),c--),a[c][1]=h,l?a[c+1][1]=l:(a.splice(c+1,1),c--))}c++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/; -diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=!1,h=!1,l=!1,k=!1;fb)break;e=c;f=d}return 
a.length!=g&&a[g][0]===DIFF_DELETE?f:f+(b-e)}; -diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],c=/&/g,d=//g,f=/\n/g,g=0;g");switch(h){case DIFF_INSERT:b[g]=''+l+"";break;case DIFF_DELETE:b[g]=''+l+"";break;case DIFF_EQUAL:b[g]=""+l+""}}return b.join("")}; -diff_match_patch.prototype.diff_text1=function(a){for(var b=[],c=0;c=a};diff_match_patch.prototype.isLowSurrogate=function(a){a=a.charCodeAt(0);return 56320<=a&&57343>=a}; -diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],c,d=0;d=k&&(k=m,g=c,h=e,l=f)}a[d-1][1]!=g&&(g?a[d-1][1]=g:(a.splice(d- +1,1),d--),a[d][1]=h,l?a[d+1][1]=l:(a.splice(d+1,1),d--))}d++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/; +diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,d=[],c=0,e=null,f=0,g=!1,h=!1,l=!1,k=!1;fb)break;e=d;f=c}return a.length!=g&&a[g][0]===DIFF_DELETE?f:f+(b-e)}; +diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],d=/&/g,c=//g,f=/\n/g,g=0;g");switch(h){case DIFF_INSERT:b[g]=''+l+"";break;case DIFF_DELETE:b[g]=''+l+"";break;case DIFF_EQUAL:b[g]=""+l+""}}return b.join("")}; +diff_match_patch.prototype.diff_text1=function(a){for(var b=[],d=0;d=a};diff_match_patch.prototype.isLowSurrogate=function(a){a=a.charCodeAt(0);return 56320<=a&&57343>=a}; +diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],d,c=0;cthis.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,c);-1!=h&&(g=Math.min(d(0,h),g),h=a.lastIndexOf(b,c+b.length),-1!=h&&(g=Math.min(d(0,h),g)));var l=1<=k;q--){var t=e[a.charAt(q-1)];m[q]=0===w?(m[q+1]<<1|1)&t:(m[q+1]<<1|1)&t|(x[q+1]|x[q])<<1|1|x[q+1];if(m[q]&l&&(t=d(w,q-1),t<=g))if(g=t,h=q-1,h>c)k=Math.max(1,2*c-h);else 
break}if(d(w+1,c)>g)break;x=m}return h}; -diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},c=0;c=2*this.Patch_Margin&&e&&(this.patch_addContext_(a,h),c.push(a),a=new diff_match_patch.patch_obj,e=0,h=d,f=g)}k!==DIFF_INSERT&&(f+=m.length);k!==DIFF_DELETE&&(g+=m.length)}e&&(this.patch_addContext_(a,h),c.push(a));return c}; -diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],c=0;cthis.Match_MaxBits){var k=this.match_main(b,h.substring(0,this.Match_MaxBits),g);-1!=k&&(l=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==l||k>=l)&&(k=-1)}else k=this.match_main(b,h, -g);if(-1==k)e[f]=!1,d-=a[f].length2-a[f].length1;else if(e[f]=!0,d=k-g,g=-1==l?b.substring(k,k+h.length):b.substring(k,l+this.Match_MaxBits),h==g)b=b.substring(0,k)+this.diff_text2(a[f].diffs)+b.substring(k+h.length);else if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);h=0;var m;for(l=0;le[0][1].length){var f=b-e[0][1].length;e[0][1]=c.substring(e[0][1].length)+e[0][1];d.start1-=f;d.start2-=f;d.length1+=f;d.length2+=f}d=a[a.length-1];e=d.diffs; -0==e.length||e[e.length-1][0]!=DIFF_EQUAL?(e.push(new diff_match_patch.Diff(DIFF_EQUAL,c)),d.length1+=b,d.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=c.substring(0,f),d.length1+=f,d.length2+=f);return c}; -diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,c=0;c2*b?(h.length1+=k.length,e+=k.length,l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),d.diffs.shift()):(k=k.substring(0,b-h.length1-this.Patch_Margin),h.length1+=k.length,e+=k.length,g===DIFF_EQUAL?(h.length2+=k.length,f+=k.length):l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),k==d.diffs[0][1]?d.diffs.shift():d.diffs[0][1]=d.diffs[0][1].substring(k.length))}g=this.diff_text2(h.diffs); 
-g=g.substring(g.length-this.Patch_Margin);k=this.diff_text1(d.diffs).substring(0,this.Patch_Margin);""!==k&&(h.length1+=k.length,h.length2+=k.length,0!==h.diffs.length&&h.diffs[h.diffs.length-1][0]===DIFF_EQUAL?h.diffs[h.diffs.length-1][1]+=k:h.diffs.push(new diff_match_patch.Diff(DIFF_EQUAL,k)));l||a.splice(++c,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],c=0;c=c)){d+=String.fromCharCode((c&65535)>>>10&1023|55296);d+=String.fromCharCode(56320|c&1023);b+=12;continue}throw new URIError("URI malformed"); +}}}}return d}}; +diff_match_patch.prototype.diff_fromDelta=function(a,b){for(var d=[],c=0,e=0,f=b.split(/\t/g),g=0;gthis.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,d);-1!=h&&(g=Math.min(c(0,h),g),h=a.lastIndexOf(b,d+b.length),-1!=h&&(g=Math.min(c(0,h),g)));var l=1<=k;q--){var t=e[a.charAt(q-1)];m[q]=0===w?(m[q+1]<<1|1)&t:(m[q+1]<<1|1)&t|(x[q+1]|x[q])<<1|1|x[q+1];if(m[q]&l&&(t=c(w,q-1),t<=g))if(g=t,h=q-1,h>d)k=Math.max(1,2*d-h);else break}if(c(w+1,d)>g)break;x=m}return h}; +diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},d=0;d=2*this.Patch_Margin&&e&&(this.patch_addContext_(a,h),d.push(a),a=new diff_match_patch.patch_obj,e=0,h=c,f=g)}k!==DIFF_INSERT&&(f+=m.length);k!==DIFF_DELETE&&(g+=m.length)}e&&(this.patch_addContext_(a,h),d.push(a));return d}; +diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],d=0;dthis.Match_MaxBits){var k=this.match_main(b,h.substring(0,this.Match_MaxBits),g);-1!=k&&(l=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==l||k>=l)&&(k=-1)}else k=this.match_main(b,h, +g);if(-1==k)e[f]=!1,c-=a[f].length2-a[f].length1;else if(e[f]=!0,c=k-g,g=-1==l?b.substring(k,k+h.length):b.substring(k,l+this.Match_MaxBits),h==g)b=b.substring(0,k)+this.diff_text2(a[f].diffs)+b.substring(k+h.length);else 
if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);h=0;var m;for(l=0;le[0][1].length){var f=b-e[0][1].length;e[0][1]=d.substring(e[0][1].length)+e[0][1];c.start1-=f;c.start2-=f;c.length1+=f;c.length2+=f}c=a[a.length-1];e=c.diffs; +0==e.length||e[e.length-1][0]!=DIFF_EQUAL?(e.push(new diff_match_patch.Diff(DIFF_EQUAL,d)),c.length1+=b,c.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=d.substring(0,f),c.length1+=f,c.length2+=f);return d}; +diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,d=0;d2*b?(h.length1+=k.length,e+=k.length,l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),c.diffs.shift()):(k=k.substring(0,b-h.length1-this.Patch_Margin),h.length1+=k.length,e+=k.length,g===DIFF_EQUAL?(h.length2+=k.length,f+=k.length):l=!1,h.diffs.push(new diff_match_patch.Diff(g,k)),k==c.diffs[0][1]?c.diffs.shift():c.diffs[0][1]=c.diffs[0][1].substring(k.length))}g=this.diff_text2(h.diffs); +g=g.substring(g.length-this.Patch_Margin);k=this.diff_text1(c.diffs).substring(0,this.Patch_Margin);""!==k&&(h.length1+=k.length,h.length2+=k.length,0!==h.diffs.length&&h.diffs[h.diffs.length-1][0]===DIFF_EQUAL?h.diffs[h.diffs.length-1][1]+=k:h.diffs.push(new diff_match_patch.Diff(DIFF_EQUAL,k)));l||a.splice(++d,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],d=0;d= 0x010000 && codePoint <= 0x10FFFF) { + decoded += String.fromCharCode((codePoint & 0xFFFF) >>> 10 & 0x3FF | 0xD800); + decoded += String.fromCharCode(0xDC00 | (codePoint & 0xFFFF) & 0x3FF); + i += 12; + continue; + } + } + + throw new URIError('URI malformed'); + } + + return decoded; + } +}; /** * Given the original text1, and an encoded string which describes the @@ -1428,7 +1522,7 @@ diff_match_patch.prototype.diff_fromDelta = function(text1, delta) { case '+': try { diffs[diffsLength++] = - new 
diff_match_patch.Diff(DIFF_INSERT, decodeURI(param)); + new diff_match_patch.Diff(DIFF_INSERT, this.decodeURI(param)); } catch (ex) { // Malformed URI sequence. throw new Error('Illegal escape in diff_fromDelta: ' + param); diff --git a/javascript/tests/diff_match_patch_test.js b/javascript/tests/diff_match_patch_test.js index 4283f9d..4222ce6 100644 --- a/javascript/tests/diff_match_patch_test.js +++ b/javascript/tests/diff_match_patch_test.js @@ -580,12 +580,20 @@ function testDiffDelta() { // Empty diff groups assertEquivalent( - JSON.stringify(dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']])), - JSON.stringify(dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']])), + dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]), + dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']]), ); - // Invalid UTF8 but valid surrogate pairs - + // Different versions of the library may have created deltas with + // half of a surrogate pair encoded as if it were valid UTF-8 + try { + assertEquivalent( + dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')), + dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1')) + ); + } catch ( e ) { + assertEquals('Decode UTF8-encoded surrogate half', 'crashed'); + } // Verify pool of unchanged characters. diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? 
: @ & = + $ , # ']]; From 7e5a643dec0d84bfd531413bec144331067273d0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 13 Dec 2019 00:56:49 -0700 Subject: [PATCH 15/34] Port the improvements to Java --- .../neil/plaintext/diff_match_patch.java | 89 +++++++++++++++++-- .../neil/plaintext/diff_match_patch_test.java | 32 +++++++ 2 files changed, 116 insertions(+), 5 deletions(-) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index d5ecb2c..fe00bc1 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -1433,6 +1433,9 @@ public String diff_toDelta(List diffs) { char lastEnd = 0; boolean isFirst = true; for (Diff aDiff : diffs) { + if (aDiff.text.isEmpty()) { + continue; + } char thisTop = aDiff.text.charAt(0); char thisEnd = aDiff.text.charAt(aDiff.text.length() - 1); @@ -1446,7 +1449,11 @@ public String diff_toDelta(List diffs) { } isFirst = false; - lastEnd = thisEnd; + + if (aDiff.operation == Operation.EQUAL) { + lastEnd = thisEnd; + } + if ( aDiff.text.isEmpty() ) { continue; } @@ -1478,6 +1485,81 @@ public String diff_toDelta(List diffs) { return delta; } + private String decodeURI(String text) throws IllegalArgumentException { + int i = 0; + StringBuffer decoded = new StringBuffer(""); + + while (i < text.length()) { + if ( text.charAt(i) != '%' ) { + decoded.append(text.charAt(i++)); + continue; + } + + // start a percent-sequence + int byte1 = Integer.parseInt(text.substring(i + 1, i + 3), 16); + + if ((byte1 & 0x80) == 0) { + decoded.append(Character.toChars(byte1)); + i += 3; + continue; + } + + if ( text.charAt(i + 3) != '%') { + throw new IllegalArgumentException(); + } + + int byte2 = Integer.parseInt(text.substring(i + 4, i + 6), 16); + if ((byte2 & 0xC0) != 0x80) { + throw new IllegalArgumentException(); + } + byte2 = byte2 & 0x3F; + if ((byte1 & 0xE0) == 0xC0) { + 
decoded.append(Character.toChars(((byte1 & 0x1F) << 6) | byte2)); + i += 6; + continue; + } + + if (text.charAt(i + 6) != '%') { + throw new IllegalArgumentException(); + } + + int byte3 = Integer.parseInt(text.substring(i + 7, i + 9), 16); + if ((byte3 & 0xC0) != 0x80) { + throw new IllegalArgumentException(); + } + byte3 = byte3 & 0x3F; + if ((byte1 & 0xF0) == 0xE0) { + // unpaired surrogate are fine here + decoded.append(Character.toChars(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3)); + i += 9; + continue; + } + + if (text.charAt(i + 9) != '%') { + throw new IllegalArgumentException(); + } + + int byte4 = Integer.parseInt(text.substring(i + 10, i + 12), 16); + if ((byte4 & 0xC0) != 0x80) { + throw new IllegalArgumentException(); + } + byte4 = byte4 & 0x3F; + if ((byte1 & 0xF8) == 0xF0) { + int codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4; + if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) { + decoded.append(Character.toChars((codePoint & 0xFFFF) >>> 10 & 0x3FF | 0xD800)); + decoded.append(Character.toChars(0xDC00 | (codePoint & 0xFFFF) & 0x3FF)); + i += 12; + continue; + } + } + + throw new IllegalArgumentException(); + } + + return decoded.toString(); + } + /** * Given the original text1, and an encoded string which describes the * operations required to transform text1 into text2, compute the full diff. @@ -1504,10 +1586,7 @@ public LinkedList diff_fromDelta(String text1, String delta) // decode would change all "+" to " " param = param.replace("+", "%2B"); try { - param = URLDecoder.decode(param, "UTF-8"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. - throw new Error("This system does not support UTF-8.", e); + param = this.decodeURI(param); } catch (IllegalArgumentException e) { // Malformed URI sequence. 
throw new IllegalArgumentException( diff --git a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java index aef98ce..5be10f1 100644 --- a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java +++ b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java @@ -428,6 +428,38 @@ public static void testDiffDelta() { delta = dmp.diff_toDelta(diffs); assertEquals("diff_toDelta: Surrogate Pairs.", "=2\t+%F0%9F%99%8C\t=2", delta); + assertEquals( + "diff_toDelta: insert surrogate pair between similar high surrogates", + dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd70"), new Diff(EQUAL, "\ud83c\udd71"))), + dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70\ud83c"), new Diff(INSERT, "\udd70\ud83c"), new Diff(EQUAL, "\udd71"))) + ); + + assertEquals( + "diff_toDelta: swap surrogate pairs delete/insert", + dmp.diff_toDelta(diffList(new Diff(DELETE, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd71"))), + dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(DELETE, "\udd70"), new Diff(INSERT, "\udd71"))) + ); + + assertEquals( + "diff_toDelta: swap surrogate pairs insert/delete", + dmp.diff_toDelta(diffList(new Diff(INSERT, "\ud83c\udd70"), new Diff(DELETE, "\ud83c\udd71"))), + dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(INSERT, "\udd70"), new Diff(DELETE, "\udd71"))) + ); + + assertEquals( + "diff_toDelta: empty diff groups", + dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(DELETE, ""), new Diff(INSERT, "ghijk"))), + dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(INSERT, "ghijk"))) + ); + + // Different versions of the library may have created deltas with + // half of a surrogate pair encoded as if it were valid UTF-8 + assertEquals( + "diff_toDelta: surrogate half encoded as UTF8", + dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "-2\t+%F0%9F%85%B1")), + 
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1")) + ); + // Verify pool of unchanged characters. diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")); String text2 = dmp.diff_text2(diffs); From 42bc948729868067b8f90fe2a0b8671de9c9d2b8 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 13 Dec 2019 01:56:56 -0700 Subject: [PATCH 16/34] Java: Use lookup vs. parseInt() for URI decoding --- .../neil/plaintext/diff_match_patch.java | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index fe00bc1..2d52098 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -1485,19 +1485,41 @@ public String diff_toDelta(List diffs) { return delta; } + private int digit16(char b) throws IllegalArgumentException { + switch (b) { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'A': case 'a': return 10; + case 'B': case 'b': return 11; + case 'C': case 'c': return 12; + case 'D': case 'd': return 13; + case 'E': case 'e': return 14; + case 'F': case 'f': return 15; + default: + throw new IllegalArgumentException(); + } + } + private String decodeURI(String text) throws IllegalArgumentException { int i = 0; StringBuffer decoded = new StringBuffer(""); while (i < text.length()) { - if ( text.charAt(i) != '%' ) { + if (text.charAt(i) != '%') { decoded.append(text.charAt(i++)); continue; } // start a percent-sequence - int byte1 = Integer.parseInt(text.substring(i + 1, i + 3), 16); - + int byte1 = (digit16(text.charAt(i + 1)) << 4) + digit16(text.charAt(i + 2)); if ((byte1 & 0x80) == 0) { 
decoded.append(Character.toChars(byte1)); i += 3; @@ -1508,7 +1530,7 @@ private String decodeURI(String text) throws IllegalArgumentException { throw new IllegalArgumentException(); } - int byte2 = Integer.parseInt(text.substring(i + 4, i + 6), 16); + int byte2 = (digit16(text.charAt(i + 4)) << 4) + digit16(text.charAt(i + 5)); if ((byte2 & 0xC0) != 0x80) { throw new IllegalArgumentException(); } @@ -1523,7 +1545,7 @@ private String decodeURI(String text) throws IllegalArgumentException { throw new IllegalArgumentException(); } - int byte3 = Integer.parseInt(text.substring(i + 7, i + 9), 16); + int byte3 = (digit16(text.charAt(i + 7)) << 4) + digit16(text.charAt(i + 8)); if ((byte3 & 0xC0) != 0x80) { throw new IllegalArgumentException(); } @@ -1539,7 +1561,7 @@ private String decodeURI(String text) throws IllegalArgumentException { throw new IllegalArgumentException(); } - int byte4 = Integer.parseInt(text.substring(i + 10, i + 12), 16); + int byte4 = (digit16(text.charAt(i + 10)) << 4) + digit16(text.charAt(i + 11)); if ((byte4 & 0xC0) != 0x80) { throw new IllegalArgumentException(); } @@ -2369,10 +2391,7 @@ public List patch_fromText(String textline) line = text.getFirst().substring(1); line = line.replace("+", "%2B"); // decode would change all "+" to " " try { - line = URLDecoder.decode(line, "UTF-8"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. - throw new Error("This system does not support UTF-8.", e); + line = this.decodeURI(line); } catch (IllegalArgumentException e) { // Malformed URI sequence. 
throw new IllegalArgumentException( From 7eeca252642c9629d570dcd99742b61b859443e6 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 16 Dec 2019 12:38:28 -0700 Subject: [PATCH 17/34] Add more test cases, update code and decoder --- .../neil/plaintext/diff_match_patch.java | 5 +--- javascript/diff_match_patch_uncompressed.js | 6 ---- javascript/tests/diff_match_patch_test.js | 29 ++++++++++++++++++- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index 2d52098..9f3f71e 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -1441,6 +1441,7 @@ public String diff_toDelta(List diffs) { char thisEnd = aDiff.text.charAt(aDiff.text.length() - 1); if (Character.isHighSurrogate(thisEnd)) { + lastEnd = thisEnd; aDiff.text = aDiff.text.substring(0, aDiff.text.length() - 1); } @@ -1450,10 +1451,6 @@ public String diff_toDelta(List diffs) { isFirst = false; - if (aDiff.operation == Operation.EQUAL) { - lastEnd = thisEnd; - } - if ( aDiff.text.isEmpty() ) { continue; } diff --git a/javascript/diff_match_patch_uncompressed.js b/javascript/diff_match_patch_uncompressed.js index 0d2821d..b15c26c 100644 --- a/javascript/diff_match_patch_uncompressed.js +++ b/javascript/diff_match_patch_uncompressed.js @@ -1381,12 +1381,6 @@ diff_match_patch.prototype.diff_toDelta = function(diffs) { thisDiff[1] = lastEnd + thisDiff[1]; } - // we have to carry the surrogate half through - // any successive insert/delete edits - if (DIFF_EQUAL === thisDiff[0]) { - lastEnd = thisEnd; - } - if (0 === thisDiff[1].length) { continue; } diff --git a/javascript/tests/diff_match_patch_test.js b/javascript/tests/diff_match_patch_test.js index 4222ce6..35d9de4 100644 --- a/javascript/tests/diff_match_patch_test.js +++ b/javascript/tests/diff_match_patch_test.js @@ -551,13 +551,40 @@ function 
testDiffDelta() { })(); // Unicode - splitting surrogates + try { + assertEquivalent( + dmp.diff_toDelta([[DIFF_INSERT,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]), + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71')) + ); + } catch ( e ) { + assertEquals('Inserting similar surrogate pair at beginning', 'crashed'); + } + try { assertEquivalent( dmp.diff_toDelta([[DIFF_EQUAL,'\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd70'], [DIFF_EQUAL, '\ud83c\udd71']]), dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd70\ud83c\udd71')) ); } catch ( e ) { - assertEquals('Inserting similar surrogate pair', 'crashed'); + assertEquals('Inserting similar surrogate pair in the middle', 'crashed'); + } + + try { + assertEquivalent( + dmp.diff_toDelta([[DIFF_DELETE,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]), + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71')) + ); + } catch ( e ) { + assertEquals('Deleting similar surrogate pair at the beginning', 'crashed'); + } + + try { + assertEquivalent( + dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c\udd70'], [DIFF_DELETE,'\ud83c\udd72'], [DIFF_EQUAL, '\ud83c\udd71']]), + dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71')) + ); + } catch ( e ) { + assertEquals('Deleting similar surrogate pair in the middle', 'crashed'); } try { From 1d166fec7e005bfbbb456ae9817be075bba20e7e Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 16 Dec 2019 12:44:00 -0700 Subject: [PATCH 18/34] Apply patch fix to objective-c --- objectivec/DiffMatchPatch.m | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index 495a07b..ba0ab76 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1301,11 +1301,15 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs; NSMutableString 
*delta = [NSMutableString string]; UniChar lastEnd = 0; for (Diff *aDiff in diffs) { + if (0 == [aDiff.text length]) { + continue; + } UniChar thisTop = [aDiff.text characterAtIndex:0]; UniChar thisEnd = [aDiff.text characterAtIndex:([aDiff.text length]-1)]; if (CFStringIsSurrogateHighCharacter(thisEnd)) { + lastEnd = thisEnd; aDiff.text = [aDiff.text substringToIndex:([aDiff.text length] - 1)]; } @@ -1313,7 +1317,6 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs; aDiff.text = [NSString stringWithFormat:@"%C%@", lastEnd, aDiff.text]; } - lastEnd = thisEnd; if (0 == [aDiff.text length]) { continue; } From 9dede7fd48a06242b39b92878cfa1bd6d8375442 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 16 Dec 2019 13:27:14 -0700 Subject: [PATCH 19/34] Add objective-c tests --- objectivec/Tests/DiffMatchPatchTest.m | 50 +++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/objectivec/Tests/DiffMatchPatchTest.m b/objectivec/Tests/DiffMatchPatchTest.m index 2869492..8761aa0 100755 --- a/objectivec/Tests/DiffMatchPatchTest.m +++ b/objectivec/Tests/DiffMatchPatchTest.m @@ -764,6 +764,56 @@ - (void)test_diff_deltaTest { expectedString = [patchResult firstObject]; XCTAssertEqualObjects(@"β˜ΊοΈπŸ˜ƒπŸ––πŸΏ", expectedString, @"Output String should match the Edited one!"); + // Unicode - splitting surrogates + + // Inserting similar surrogate pair at beginning + diffs = [NSMutableArray arrayWithObjects: + [Diff diffWithOperation:DIFF_INSERT andText:@"πŸ…±"], + [Diff diffWithOperation:DIFF_EQUAL andText:@"πŸ…°πŸ…±"], + nil]; + XCTAssertEqualObjects( [dmp diff_toDelta:diffs], [dmp diff_toDelta:[dmp diff_mainOfOldString:@"πŸ…°πŸ…±" andNewString:@"πŸ…±πŸ…°πŸ…±"]]); + + // Inserting similar surrogate pair in the middle + diffs = [NSMutableArray arrayWithObjects: + [Diff diffWithOperation:DIFF_EQUAL andText:@"πŸ…°"], + [Diff diffWithOperation:DIFF_INSERT andText:@"πŸ…°"], + [Diff diffWithOperation:DIFF_EQUAL andText:@"πŸ…±"], + nil]; + XCTAssertEqualObjects( 
[dmp diff_toDelta:diffs], [dmp diff_toDelta:[dmp diff_mainOfOldString:@"πŸ…°πŸ…±" andNewString:@"πŸ…°πŸ…°πŸ…±"]]); + + // Deleting similar surrogate pair at the beginning + diffs = [NSMutableArray arrayWithObjects: + [Diff diffWithOperation:DIFF_DELETE andText:@"πŸ…±"], + [Diff diffWithOperation:DIFF_EQUAL andText:@"πŸ…°πŸ…±"], + nil]; + XCTAssertEqualObjects( [dmp diff_toDelta:diffs], [dmp diff_toDelta:[dmp diff_mainOfOldString:@"πŸ…±πŸ…°πŸ…±" andNewString:@"πŸ…°πŸ…±"]]); + + // Deleting similar surrogate pair in the middle + diffs = [NSMutableArray arrayWithObjects: + [Diff diffWithOperation:DIFF_EQUAL andText:@"πŸ…°"], + [Diff diffWithOperation:DIFF_DELETE andText:@"πŸ…²"], + [Diff diffWithOperation:DIFF_EQUAL andText:@"πŸ…±"], + nil]; + XCTAssertEqualObjects( [dmp diff_toDelta:diffs], [dmp diff_toDelta:[dmp diff_mainOfOldString:@"πŸ…°πŸ…²πŸ…±" andNewString:@"πŸ…°πŸ…±"]]); + + // Swapping surrogate pairs + diffs = [NSMutableArray arrayWithObjects: + [Diff diffWithOperation:DIFF_DELETE andText:@"πŸ…°"], + [Diff diffWithOperation:DIFF_INSERT andText:@"πŸ…±"], + nil]; + XCTAssertEqualObjects( [dmp diff_toDelta:diffs], [dmp diff_toDelta:[dmp diff_mainOfOldString:@"πŸ…°" andNewString:@"πŸ…±"]]); + + // Swapping surrogate pairs + XCTAssertEqualObjects( [dmp diff_toDelta:([NSMutableArray arrayWithObjects: + [Diff diffWithOperation:DIFF_DELETE andText:@"πŸ…°"], + [Diff diffWithOperation:DIFF_INSERT andText:@"πŸ…±"], + nil])], + [dmp diff_toDelta:([NSMutableArray arrayWithObjects: + [Diff diffWithOperation:DIFF_EQUAL andText:[NSString stringWithFormat:@"%C", 0xd83c]], + [Diff diffWithOperation:DIFF_DELETE andText:[NSString stringWithFormat:@"%C", 0xdd70]], + [Diff diffWithOperation:DIFF_INSERT andText:[NSString stringWithFormat:@"%C", 0xdd71]], + nil])]); + // Verify pool of unchanged characters. diffs = [NSMutableArray arrayWithObject: [Diff diffWithOperation:DIFF_INSERT andText:@"A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? 
: @ & = + $ , # "]]; From dbada062e3ebbe7bb39ba9d187b39a02740f6dfa Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 16 Dec 2019 13:57:57 -0700 Subject: [PATCH 20/34] JavaScript: Use `digit16` instead of `parseInt()` There's no need to validate the entire range of input integers. Using `digit16()` will be faster and give us more precise validation. --- javascript/diff_match_patch_uncompressed.js | 31 +++++++++++++++++---- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/javascript/diff_match_patch_uncompressed.js b/javascript/diff_match_patch_uncompressed.js index b15c26c..67bee74 100644 --- a/javascript/diff_match_patch_uncompressed.js +++ b/javascript/diff_match_patch_uncompressed.js @@ -1400,6 +1400,28 @@ diff_match_patch.prototype.diff_toDelta = function(diffs) { return text.join('\t').replace(/%20/g, ' '); }; +diff_match_patch.prototype.digit16 = function(c) { + switch (c) { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'A': case 'a': return 10; + case 'B': case 'b': return 11; + case 'C': case 'c': return 12; + case 'D': case 'd': return 13; + case 'E': case 'e': return 14; + case 'F': case 'f': return 15; + default: throw new Error('Invalid hex-code'); + } +}; + /** * Decode URI-encoded string but allow for encoded surrogate halves * @@ -1430,8 +1452,7 @@ diff_match_patch.prototype.decodeURI = function(text) { } // start a percent-sequence - var byte1 = parseInt(text.substring(i + 1, i + 3), 16); - + var byte1 = (this.digit16(text[i + 1]) << 4) + this.digit16(text[i + 2]); if ((byte1 & 0x80) === 0) { decoded += String.fromCharCode(byte1); i += 3; @@ -1442,7 +1463,7 @@ diff_match_patch.prototype.decodeURI = function(text) { throw new URIError('URI malformed'); } - var byte2 = parseInt(text.substring(i + 4, i + 6), 16); + var byte2 = (this.digit16(text[i + 4]) << 
4) + this.digit16(text[i + 5]); if ((byte2 & 0xC0) !== 0x80) { throw new URIError('URI malformed'); } @@ -1457,7 +1478,7 @@ diff_match_patch.prototype.decodeURI = function(text) { throw new URIError('URI malformed'); } - var byte3 = parseInt(text.substring(i + 7, i + 9), 16); + var byte3 = (this.digit16(text[i + 7]) << 4) + this.digit16(text[i + 8]); if ((byte3 & 0xC0) !== 0x80) { throw new URIError('URI malformed'); } @@ -1473,7 +1494,7 @@ diff_match_patch.prototype.decodeURI = function(text) { throw new URIError('URI malformed'); } - var byte4 = parseInt(text.substring(i + 10, i + 12), 16); + var byte4 = (this.digit16(text[i + 10]) << 4) + this.digit16(text[i + 11]); if ((byte4 & 0xC0) !== 0x80) { throw new URIError('URI malformed'); } From 6e03198e3102327d358c7e88e1427eddfe6e6f19 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 16 Dec 2019 14:24:56 -0700 Subject: [PATCH 21/34] Objective-C: Introduce failing test for receiving an invalid patch --- objectivec/Tests/DiffMatchPatchTest.m | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/objectivec/Tests/DiffMatchPatchTest.m b/objectivec/Tests/DiffMatchPatchTest.m index 8761aa0..7e31508 100755 --- a/objectivec/Tests/DiffMatchPatchTest.m +++ b/objectivec/Tests/DiffMatchPatchTest.m @@ -843,6 +843,11 @@ - (void)test_diff_deltaTest { expectedResult = [dmp diff_fromDeltaWithText:@"" andDelta:delta error:NULL]; XCTAssertEqualObjects(diffs, expectedResult, @"diff_fromDelta: 160kb string. 
Convert delta string into a diff."); + // Different versions of the library may have created deltas with + // half of a surrogate pair encoded as if it were valid UTF-8 + XCTAssertEqualObjects([dmp diff_toDelta:([dmp diff_fromDeltaWithText:@"πŸ…°" andDelta:@"-2\t+%F0%9F%85%B1" error:NULL])], + [dmp diff_toDelta:([dmp diff_fromDeltaWithText:@"πŸ…°" andDelta:@"=1\t-1\t+%ED%B5%B1" error:NULL])]); + [dmp release]; } From 989d6c6e78dd835a3aaa5d0e6bfcea26f4b97b08 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 16 Dec 2019 17:44:28 -0700 Subject: [PATCH 22/34] Python: Update patch and handle wide/narrow compilation Python can be compiled in "narrow" mode or "wide" mode which determines how wide the internal string units are. In narrow mode we have UCS-2 code units and higher-order Unicode code points will be stored as surrogate pairs. In wide mode we have UCS-4 code units and so all Unicode code points will be stored as a single item in the string. This patch incorporates a decision based on the internal string width to run the native-looking `toDelta` or the _encoded_ version. In the _encoded_ version we explicitly encode the string to `utf-16be` for consistency's sake with the other libraries. We _could_ always run the encoded version but I suspect that the native version will be faster and many deployments will be running narrow mode. 
--- python2/diff_match_patch.py | 68 +++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/python2/diff_match_patch.py b/python2/diff_match_patch.py index 71cbd19..1738bd6 100644 --- a/python2/diff_match_patch.py +++ b/python2/diff_match_patch.py @@ -1137,16 +1137,47 @@ def diff_levenshtein(self, diffs): return levenshtein @classmethod - def is_high_surrogate(cls, utf16be_bytes): - c = struct.unpack('>H', utf16be_bytes)[0] - return c >= 0xd800 and c <= 0xdbff + def is_high_surrogate(cls, c): + return 0xd800 <= c <= 0xdbff @classmethod - def is_low_surrogate(cls, utf16be_bytes): - c = struct.unpack('>H', utf16be_bytes)[0] - return c >= 0xdc00 and c <= 0xdfff + def is_low_surrogate(cls, c): + return 0xdc00 <= c <= 0xdfff - def diff_toDelta(self, diffs): + @classmethod + def ucs2ord(cls, utf16be_bytes): + return struct.unpack('>H', utf16be_bytes)[0] + + def diff_toDelta_narrow(self, diffs): + text = [] + last_end = None + for (op, data) in diffs: + if 0 == len(data): + continue + + this_top = data[0] + this_end = data[-1] + + if self.is_high_surrogate(ord(this_end)): + last_end = this_end + data = data[:-1] + + if last_end and self.is_high_surrogate(ord(last_end)) and self.is_low_surrogate(ord(this_top)): + data = last_end + data + + if 0 == len(data): + continue + + if op == self.DIFF_INSERT: + # High ascii will raise UnicodeDecodeError. Use Unicode instead. + text.append("+" + urllib.quote(data.encode('utf-8'), "!~*'();/?:@&=+$,# ")) + elif op == self.DIFF_DELETE: + text.append("-%d" % len(data)) + elif op == self.DIFF_EQUAL: + text.append("=%d" % len(data)) + return "\t".join(text) + + def diff_toDelta_wide(self, diffs): """Crush the diff into an encoded string which describes the operations required to transform text1 into text2. E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. 
@@ -1161,31 +1192,38 @@ def diff_toDelta(self, diffs): text = [] last_end = None for (op, data) in diffs: + if 0 == len(data): + continue + encoded = data.encode('utf-16be') this_top = encoded[0:2] this_end = encoded[-2:] - if self.is_high_surrogate(this_end): + if self.is_high_surrogate(self.ucs2ord(this_end)): + last_end = this_end encoded = encoded[0:-2] - if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top): + if last_end and self.is_high_surrogate(self.ucs2ord(last_end)) and self.is_low_surrogate(self.ucs2ord(this_top)): encoded = last_end + encoded - data = encoded.decode('utf-16be') - last_end = this_end if 0 == len(encoded): continue if op == self.DIFF_INSERT: # High ascii will raise UnicodeDecodeError. Use Unicode instead. - data = data.encode("utf-8") - text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# ")) + text.append("+" + urllib.quote(encoded.decode('utf-16be').encode('utf-8'), "!~*'();/?:@&=+$,# ")) elif op == self.DIFF_DELETE: - text.append("-%d" % (len(data.encode('utf-16be')) // 2)) + text.append("-%d" % (len(encoded) // 2)) elif op == self.DIFF_EQUAL: - text.append("=%d" % (len(data.encode('utf-16be')) // 2)) + text.append("=%d" % (len(encoded) // 2)) return "\t".join(text) + def diff_toDelta(self, diffs, encode_data = sys.maxunicode == 0xFFFF): + if encode_data: + return self.diff_toDelta_wide(diffs) + else: + return self.diff_toDelta_narrow(diffs) + def diff_fromDelta(self, text1, delta): """Given the original text1, and an encoded string which describes the operations required to transform text1 into text2, compute the full diff. From 22cb3961e2f84c2e69d1a96c663962144e1d0edd Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 16 Dec 2019 17:59:16 -0700 Subject: [PATCH 23/34] Python: Always run encoded toDelta In testing with `speedtest.py` I found no significant performance impact for running the "native" narrow-mode `toDelta()` compared to running the fully-encoding version that operates on bytes. 
For the sake of simplicity I'm removing the narrow version. --- python2/diff_match_patch.py | 49 ++++--------------------------------- 1 file changed, 5 insertions(+), 44 deletions(-) diff --git a/python2/diff_match_patch.py b/python2/diff_match_patch.py index 1738bd6..8b26125 100644 --- a/python2/diff_match_patch.py +++ b/python2/diff_match_patch.py @@ -1138,46 +1138,13 @@ def diff_levenshtein(self, diffs): @classmethod def is_high_surrogate(cls, c): - return 0xd800 <= c <= 0xdbff + return 0xd800 <= struct.unpack('>H', c)[0] <= 0xdbff @classmethod def is_low_surrogate(cls, c): - return 0xdc00 <= c <= 0xdfff + return 0xdc00 <= struct.unpack('>H', c)[0] <= 0xdfff - @classmethod - def ucs2ord(cls, utf16be_bytes): - return struct.unpack('>H', utf16be_bytes)[0] - - def diff_toDelta_narrow(self, diffs): - text = [] - last_end = None - for (op, data) in diffs: - if 0 == len(data): - continue - - this_top = data[0] - this_end = data[-1] - - if self.is_high_surrogate(ord(this_end)): - last_end = this_end - data = data[:-1] - - if last_end and self.is_high_surrogate(ord(last_end)) and self.is_low_surrogate(ord(this_top)): - data = last_end + data - - if 0 == len(data): - continue - - if op == self.DIFF_INSERT: - # High ascii will raise UnicodeDecodeError. Use Unicode instead. - text.append("+" + urllib.quote(data.encode('utf-8'), "!~*'();/?:@&=+$,# ")) - elif op == self.DIFF_DELETE: - text.append("-%d" % len(data)) - elif op == self.DIFF_EQUAL: - text.append("=%d" % len(data)) - return "\t".join(text) - - def diff_toDelta_wide(self, diffs): + def diff_toDelta(self, diffs): """Crush the diff into an encoded string which describes the operations required to transform text1 into text2. E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. 
@@ -1199,11 +1166,11 @@ def diff_toDelta_wide(self, diffs): this_top = encoded[0:2] this_end = encoded[-2:] - if self.is_high_surrogate(self.ucs2ord(this_end)): + if self.is_high_surrogate(this_end): last_end = this_end encoded = encoded[0:-2] - if last_end and self.is_high_surrogate(self.ucs2ord(last_end)) and self.is_low_surrogate(self.ucs2ord(this_top)): + if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top): encoded = last_end + encoded if 0 == len(encoded): @@ -1218,12 +1185,6 @@ def diff_toDelta_wide(self, diffs): text.append("=%d" % (len(encoded) // 2)) return "\t".join(text) - def diff_toDelta(self, diffs, encode_data = sys.maxunicode == 0xFFFF): - if encode_data: - return self.diff_toDelta_wide(diffs) - else: - return self.diff_toDelta_narrow(diffs) - def diff_fromDelta(self, text1, delta): """Given the original text1, and an encoded string which describes the operations required to transform text1 into text2, compute the full diff. From 3c31e6273f19bb2dfce28f4dd4654e03af8fdf1c Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 17 Dec 2019 16:48:21 -0700 Subject: [PATCH 24/34] Python: Add tests and update Python3 toDelta fix --- python2/tests/diff_match_patch_test.py | 76 ++++++++++++++++++++++++++ python3/diff_match_patch.py | 3 + python3/tests/diff_match_patch_test.py | 63 +++++++++++++++++++++ 3 files changed, 142 insertions(+) diff --git a/python2/tests/diff_match_patch_test.py b/python2/tests/diff_match_patch_test.py index fc633bc..94f5fd3 100644 --- a/python2/tests/diff_match_patch_test.py +++ b/python2/tests/diff_match_patch_test.py @@ -445,6 +445,82 @@ def testDiffDelta(self): delta = self.dmp.diff_toDelta(diffs) self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta) + # Unicode: split surrogates + # Inserting similar surrogate pair at beginning + self.assertEquals( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_INSERT, u'\U0001F171'), + (self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171') + ]), + 
self.dmp.diff_toDelta(self.dmp.diff_main( + u'\U0001F170\U0001F171', + u'\U0001F171\U0001F170\U0001F171' + )) + ) + + # Inserting similar surrogate pair in the middle + self.assertEquals( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_EQUAL, u'\U0001F170'), + (self.dmp.DIFF_INSERT, u'\U0001F172'), + (self.dmp.DIFF_EQUAL, u'\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + u'\U0001F170\U0001F171', + u'\U0001F170\U0001F172\U0001F171' + )) + ) + + # Deleting similar surogate pair at the beginning + self.assertEquals( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_DELETE, u'\U0001F171'), + (self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + u'\U0001F171\U0001F170\U0001F171', + u'\U0001F170\U0001F171' + )) + ) + + # Deleting similar surogate pair in the middle + self.assertEquals( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_EQUAL, u'\U0001F170'), + (self.dmp.DIFF_DELETE, u'\U0001F172'), + (self.dmp.DIFF_EQUAL, u'\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + u'\U0001F170\U0001F172\U0001F171', + u'\U0001F170\U0001F171' + )) + ) + + # Swap surrogate pair + self.assertEquals( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_DELETE, u'\U0001F170'), + (self.dmp.DIFF_INSERT, u'\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + u'\U0001F170', + u'\U0001F171' + )) + ) + + # Swap surrogate pair, force the invalid diff groups + self.assertEquals( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_INSERT, u'\U0001F170'), + (self.dmp.DIFF_DELETE, u'\U0001F171') + ]), + self.dmp.diff_toDelta([ + (self.dmp.DIFF_EQUAL, u'\ud83c'), + (self.dmp.DIFF_INSERT, u'\udd70'), + (self.dmp.DIFF_DELETE, u'\udd71') + ]) + ) + # Verify pool of unchanged characters. diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? 
: @ & = + $ , # ")] text2 = self.dmp.diff_text2(diffs) diff --git a/python3/diff_match_patch.py b/python3/diff_match_patch.py index 99aa853..3bf825c 100644 --- a/python3/diff_match_patch.py +++ b/python3/diff_match_patch.py @@ -1148,6 +1148,9 @@ def diff_toDelta(self, diffs): """ text = [] for (op, data) in diffs: + if 0 == len(data): + continue + if op == self.DIFF_INSERT: # High ascii will raise UnicodeDecodeError. Use Unicode instead. data = data.encode("utf-8") diff --git a/python3/tests/diff_match_patch_test.py b/python3/tests/diff_match_patch_test.py index 9474762..4ff16ab 100644 --- a/python3/tests/diff_match_patch_test.py +++ b/python3/tests/diff_match_patch_test.py @@ -462,6 +462,69 @@ def testDiffDelta(self): # Convert delta string into a diff. self.assertEqual(diffs, self.dmp.diff_fromDelta("", delta)) + # Unicode: split surrogates + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_INSERT, '\U0001F171'), + (self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F170\U0001F171', + '\U0001F171\U0001F170\U0001F171' + )), + 'Inserting similar surrogate pair at beginning' + ) + + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_EQUAL, '\U0001F170'), + (self.dmp.DIFF_INSERT, '\U0001F172'), + (self.dmp.DIFF_EQUAL, '\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F170\U0001F171', + '\U0001F170\U0001F172\U0001F171' + )), + 'Inserting similar surrogate pair in the middle' + ) + + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_DELETE, '\U0001F171'), + (self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F171\U0001F170\U0001F171', + '\U0001F170\U0001F171' + )), + 'Deleting similar surogate pair at the beginning' + ) + + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_EQUAL, '\U0001F170'), + (self.dmp.DIFF_DELETE, '\U0001F172'), + (self.dmp.DIFF_EQUAL, '\U0001F171') + ]), + 
self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F170\U0001F172\U0001F171', + '\U0001F170\U0001F171' + )), + 'Deleting similar surogate pair in the middle' + ) + + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_DELETE, '\U0001F170'), + (self.dmp.DIFF_INSERT, '\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F170', + '\U0001F171' + )), + 'Swap surrogate pair' + ) + # 160 kb string. a = "abcdefghij" for i in range(14): From df217532a84dfbc63fa8049f94a331776421ce3a Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 19 Dec 2019 10:14:34 -0700 Subject: [PATCH 25/34] Update Objective-C code to handle invalid incoming deltas --- objectivec/DiffMatchPatch.m | 123 +++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index ba0ab76..486c0b3 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1342,6 +1342,127 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs; return delta; } +- (int)diff_digit16:(unichar)c +{ + switch (c) { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'A': case 'a': return 10; + case 'B': case 'b': return 11; + case 'C': case 'c': return 12; + case 'D': case 'd': return 13; + case 'E': case 'e': return 14; + case 'F': case 'f': return 15; + default: + [NSException raise:@"Invalid percent-encoded string" format:@"%c is not a hex digit", c]; + } +} + +- (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded +{ + unichar decoded[[percentEncoded length]]; + int input = 0; + int output = 0; + + @try { + while (input < [percentEncoded length]) { + unichar c = [percentEncoded characterAtIndex:input]; + + if ('%' != c) { + decoded[output++] = c; + input += 1; + continue; + } + + int byte1 = ([self 
diff_digit16:[percentEncoded characterAtIndex:(input+1)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+2)]]; + + if ((byte1 & 0x80) == 0) { + decoded[output++] = byte1; + input += 3; + continue; + } + + if ('%' != [percentEncoded characterAtIndex:(input + 3)]) { + [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + } + + int byte2 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+4)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+5)]]; + + if ((byte2 & 0xC0) != 0x80) { + [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + } + + byte2 = byte2 & 0x3F; + + if ((byte1 & 0xE0) == 0xC0) { + decoded[output++] = ((byte1 & 0x1F) << 6) | byte2; + input += 6; + continue; + } + + if ('%' != [percentEncoded characterAtIndex:(input + 6)]) { + [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + } + + int byte3 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+7)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+8)]]; + + if ((byte3 & 0xC0) != 0x80) { + [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + } + + byte3 = byte3 & 0x3F; + + if ((byte1 & 0xF0) == 0xE0) { + decoded[output++] = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3; + input += 9; + continue; + } + + if ('%' != [percentEncoded characterAtIndex:(input + 9)]) { + [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + } + + int byte4 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+10)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+11)]]; + + if ((byte4 & 0xC0) != 0x80) { + [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", 
percentEncoded]; + } + + byte4 = byte4 & 0x3F; + + if ((byte1 & 0xF8) == 0xF0) { + int codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4; + if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) { + codePoint -= 0x010000; + decoded[output++] = ((codePoint >> 10) & 0x3FF) | 0xD800; + decoded[output++] = 0xDC00 | (codePoint & 0x3FF); + input += 12; + continue; + } + } + + [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + } + } + @catch (NSException *e) { + return nil; + } + + return [NSString stringWithCharacters:decoded length:output]; +} + /** * Given the original text1, and an encoded NSString which describes the * operations required to transform text1 into text2, compute the full diff. @@ -1369,7 +1490,7 @@ - (NSMutableArray *)diff_fromDeltaWithText:(NSString *)text1 NSString *param = [token substringFromIndex:1]; switch ([token characterAtIndex:0]) { case '+': - param = [param diff_stringByReplacingPercentEscapesForEncodeUriCompatibility]; + param = [self diff_decodeURIWithText:param]; if (param == nil) { if (error != NULL) { errorDetail = [NSDictionary dictionaryWithObjectsAndKeys: From 9a741a49445fb72f8ab892678cd0f97a9b58ecbf Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 19 Dec 2019 12:24:58 -0700 Subject: [PATCH 26/34] JavaScript: Fix known-broken diffs from objective-c --- javascript/diff_match_patch_uncompressed.js | 8 +++++++- javascript/tests/diff_match_patch_test.js | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/javascript/diff_match_patch_uncompressed.js b/javascript/diff_match_patch_uncompressed.js index 67bee74..2b93ecc 100644 --- a/javascript/diff_match_patch_uncompressed.js +++ b/javascript/diff_match_patch_uncompressed.js @@ -1512,7 +1512,13 @@ diff_match_patch.prototype.decodeURI = function(text) { throw new URIError('URI malformed'); } - return decoded; + // some objective-c versions of the library produced patches 
with + // (null) in the place where surrogates were split across diff + // boundaries. if we leave those in we'll be stuck with a + // high-surrogate (null) low-surrogate pattern that will break + // deeper in the library or consumping application. we'll "fix" + // these by dropping the (null) and re-joining the surrogate halves + return decoded.replace(/([\uD800-\uDBFF])\(null\)([\uDC00-\uDFFF])/g, "$1$2"); } }; diff --git a/javascript/tests/diff_match_patch_test.js b/javascript/tests/diff_match_patch_test.js index 35d9de4..9ea1c6e 100644 --- a/javascript/tests/diff_match_patch_test.js +++ b/javascript/tests/diff_match_patch_test.js @@ -605,6 +605,15 @@ function testDiffDelta() { assertEquals('Swap surrogate pair', 'crashed'); } + try { + assertEquivalent( + dmp.diff_fromDelta('', '+%ED%A0%BC%28null%29%ED%B5%B0'), + [[DIFF_INSERT, '\ud83c\udd70']] + ); + } catch ( e ) { + assertEquals('Invalid diff from objective-c with (null) string' ); + } + // Empty diff groups assertEquivalent( dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]), From 8e5241bf1787a214c1442beb6d73cd1f741416b6 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 19 Dec 2019 12:47:12 -0700 Subject: [PATCH 27/34] Objective-C: Update from feedback - idiomatic code and bounds checks --- objectivec/DiffMatchPatch.m | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index 486c0b3..2d1befa 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1368,9 +1368,15 @@ - (int)diff_digit16:(unichar)c - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded { - unichar decoded[[percentEncoded length]]; - int input = 0; - int output = 0; + NSInteger inputLength = [percentEncoded length]; + + if (0 == inputLength) { + return @""; + } + + unichar decoded[inputLength]; + NSInteger input = 0; + NSInteger output = 0; @try { while (input < 
[percentEncoded length]) { @@ -1382,6 +1388,10 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded continue; } + if (inputLength < input + 3) { + return nil; + } + int byte1 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+1)]] << 4) + [self diff_digit16:[percentEncoded characterAtIndex:(input+2)]]; @@ -1391,15 +1401,15 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded continue; } - if ('%' != [percentEncoded characterAtIndex:(input + 3)]) { - [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + if (inputLength < input + 6 || '%' != [percentEncoded characterAtIndex:(input + 3)]) { + return nil; } int byte2 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+4)]] << 4) + [self diff_digit16:[percentEncoded characterAtIndex:(input+5)]]; if ((byte2 & 0xC0) != 0x80) { - [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + return nil; } byte2 = byte2 & 0x3F; @@ -1410,15 +1420,15 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded continue; } - if ('%' != [percentEncoded characterAtIndex:(input + 6)]) { - [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + if (inputLength < input + 9 || '%' != [percentEncoded characterAtIndex:(input + 6)]) { + return nil; } int byte3 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+7)]] << 4) + [self diff_digit16:[percentEncoded characterAtIndex:(input+8)]]; if ((byte3 & 0xC0) != 0x80) { - [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + return nil; } byte3 = byte3 & 0x3F; @@ -1429,15 +1439,15 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded continue; } - if ('%' != [percentEncoded characterAtIndex:(input + 9)]) { - [NSException raise:@"Invalid percent-encoded string" format:@"Cannot 
decode UTF-8 sequence: %@", percentEncoded]; + if (inputLength < input + 12 || '%' != [percentEncoded characterAtIndex:(input + 9)]) { + return nil; } int byte4 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+10)]] << 4) + [self diff_digit16:[percentEncoded characterAtIndex:(input+11)]]; if ((byte4 & 0xC0) != 0x80) { - [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + return nil; } byte4 = byte4 & 0x3F; @@ -1453,7 +1463,7 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded } } - [NSException raise:@"Invalid percent-encoded string" format:@"Cannot decode UTF-8 sequence: %@", percentEncoded]; + return nil; } } @catch (NSException *e) { From 3a00a83eb18b956e2171676981f8560f3f6b4be9 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 19 Dec 2019 13:25:04 -0700 Subject: [PATCH 28/34] ObjectiveC: Stop doing bounds checking and rely on exception-catching --- objectivec/DiffMatchPatch.m | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index 2d1befa..62b1e3f 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1368,13 +1368,7 @@ - (int)diff_digit16:(unichar)c - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded { - NSInteger inputLength = [percentEncoded length]; - - if (0 == inputLength) { - return @""; - } - - unichar decoded[inputLength]; + unichar decoded[[percentEncoded length]]; NSInteger input = 0; NSInteger output = 0; @@ -1388,10 +1382,6 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded continue; } - if (inputLength < input + 3) { - return nil; - } - int byte1 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+1)]] << 4) + [self diff_digit16:[percentEncoded characterAtIndex:(input+2)]]; @@ -1401,7 +1391,7 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded continue; } - if (inputLength < input + 6 
|| '%' != [percentEncoded characterAtIndex:(input + 3)]) { + if ('%' != [percentEncoded characterAtIndex:(input + 3)]) { return nil; } @@ -1420,7 +1410,7 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded continue; } - if (inputLength < input + 9 || '%' != [percentEncoded characterAtIndex:(input + 6)]) { + if ('%' != [percentEncoded characterAtIndex:(input + 6)]) { return nil; } @@ -1439,7 +1429,7 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded continue; } - if (inputLength < input + 12 || '%' != [percentEncoded characterAtIndex:(input + 9)]) { + if ('%' != [percentEncoded characterAtIndex:(input + 9)]) { return nil; } From 57c824653dbd76d8576b6cc6db5622cee4e053b1 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 19 Dec 2019 13:31:21 -0700 Subject: [PATCH 29/34] ObjectiveC: Use `uint16` and `uint32` for more specified bits --- objectivec/DiffMatchPatch.m | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index 62b1e3f..926be4c 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1342,7 +1342,7 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs; return delta; } -- (int)diff_digit16:(unichar)c +- (NSInteger)diff_digit16:(unichar)c { switch (c) { case '0': return 0; @@ -1382,8 +1382,8 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded continue; } - int byte1 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+1)]] << 4) + - [self diff_digit16:[percentEncoded characterAtIndex:(input+2)]]; + uint16 byte1 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+1)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+2)]]; if ((byte1 & 0x80) == 0) { decoded[output++] = byte1; @@ -1395,8 +1395,8 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded return nil; } - int byte2 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+4)]] << 4) + - 
[self diff_digit16:[percentEncoded characterAtIndex:(input+5)]]; + uint16 byte2 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+4)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+5)]]; if ((byte2 & 0xC0) != 0x80) { return nil; @@ -1414,8 +1414,8 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded return nil; } - int byte3 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+7)]] << 4) + - [self diff_digit16:[percentEncoded characterAtIndex:(input+8)]]; + uint16 byte3 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+7)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+8)]]; if ((byte3 & 0xC0) != 0x80) { return nil; @@ -1433,8 +1433,8 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded return nil; } - int byte4 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+10)]] << 4) + - [self diff_digit16:[percentEncoded characterAtIndex:(input+11)]]; + uint16 byte4 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+10)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+11)]]; if ((byte4 & 0xC0) != 0x80) { return nil; @@ -1443,7 +1443,7 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded byte4 = byte4 & 0x3F; if ((byte1 & 0xF8) == 0xF0) { - int codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4; + uint32 codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4; if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) { codePoint -= 0x010000; decoded[output++] = ((codePoint >> 10) & 0x3FF) | 0xD800; From 9703fc4eb1dae364a5775dd0f8325ef41cc89236 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 19 Dec 2019 13:53:19 -0700 Subject: [PATCH 30/34] ObjectiveC: Use NSUInteger and explain some bit shifting --- objectivec/DiffMatchPatch.m | 53 ++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/objectivec/DiffMatchPatch.m 
b/objectivec/DiffMatchPatch.m index 926be4c..98455af 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1342,7 +1342,7 @@ - (NSString *)diff_toDelta:(NSMutableArray *)diffs; return delta; } -- (NSInteger)diff_digit16:(unichar)c +- (NSUInteger)diff_digit16:(unichar)c { switch (c) { case '0': return 0; @@ -1366,6 +1366,16 @@ - (NSInteger)diff_digit16:(unichar)c } } +/** +* Decode a percent-encoded UTF-8 string into a string of UTF-16 code units +* This is more permissive than `stringByRemovingPercentEncoding` because +* that fails if the input represents invalid Unicode characters. However, different +* diff-match-patch libraries may encode surrogate halves as if they were valid +* Unicode code points. Therefore, instead of failing or corrupting the output, which +* `stringByRemovingPercentEncoding` does when it inserts "(null)" in these places +* we can decode it anyway and then once the string is reconstructed from the diffs +* we'll end up with valid Unicode again, after the surrogate halves are re-joined +*/ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded { unichar decoded[[percentEncoded length]]; @@ -1376,46 +1386,57 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded while (input < [percentEncoded length]) { unichar c = [percentEncoded characterAtIndex:input]; + // not special, so just return it if ('%' != c) { decoded[output++] = c; input += 1; continue; } - uint16 byte1 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+1)]] << 4) + - [self diff_digit16:[percentEncoded characterAtIndex:(input+2)]]; + NSUInteger byte1 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+1)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+2)]]; + // single-byte UTF-8 first byte has bitmask 0xxx xxxx if ((byte1 & 0x80) == 0) { decoded[output++] = byte1; input += 3; continue; } + // at least one continuation byte if ('%' != [percentEncoded characterAtIndex:(input + 3)]) { return 
nil; } - uint16 byte2 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+4)]] << 4) + - [self diff_digit16:[percentEncoded characterAtIndex:(input+5)]]; + NSUInteger byte2 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+4)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+5)]]; + // continuation bytes have bitmask 10xx xxxx if ((byte2 & 0xC0) != 0x80) { return nil; } + // continuation bytes thus only contribute six bits each + // these data bits are found with the bit mask xx11 1111 byte2 = byte2 & 0x3F; + // in two-byte sequences the first byte has bitmask 110x xxxx if ((byte1 & 0xE0) == 0xC0) { + // byte1 ___x xxxx << 6 + // byte2 __yy yyyy + // value x xxxxyy yyyy -> 11 bits decoded[output++] = ((byte1 & 0x1F) << 6) | byte2; input += 6; continue; } + // at least two continuation bytes if ('%' != [percentEncoded characterAtIndex:(input + 6)]) { return nil; } - uint16 byte3 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+7)]] << 4) + - [self diff_digit16:[percentEncoded characterAtIndex:(input+8)]]; + NSUInteger byte3 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+7)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+8)]]; if ((byte3 & 0xC0) != 0x80) { return nil; @@ -1423,18 +1444,24 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded byte3 = byte3 & 0x3F; + // in three-byte sequences the first byte has bitmask 1110 xxxx if ((byte1 & 0xF0) == 0xE0) { + // byte1 ____ xxxx << 12 + // byte2 __yy yyyy << 6 + // byte3 __zz zzzz + // value xxxxyy yyyyzz zzzz -> 16 bits decoded[output++] = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3; input += 9; continue; } + // three continuation bytes if ('%' != [percentEncoded characterAtIndex:(input + 9)]) { return nil; } - uint16 byte4 = ([self diff_digit16:[percentEncoded characterAtIndex:(input+10)]] << 4) + - [self diff_digit16:[percentEncoded characterAtIndex:(input+11)]]; + NSUInteger byte4 = ([self 
diff_digit16:[percentEncoded characterAtIndex:(input+10)]] << 4) + + [self diff_digit16:[percentEncoded characterAtIndex:(input+11)]]; if ((byte4 & 0xC0) != 0x80) { return nil; @@ -1442,8 +1469,14 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded byte4 = byte4 & 0x3F; + // in four-byte sequences the first byte has bitmask 1111 0xxx if ((byte1 & 0xF8) == 0xF0) { - uint32 codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4; + // byte1 ____ _xxx << 18 + // byte2 __yy yyyy << 12 + // byte3 __zz zzzz << 6 + // byte4 __tt tttt + // value xxxyy yyyyzz zzzztt tttt -> 21 bits + NSUInteger codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4; if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) { codePoint -= 0x010000; decoded[output++] = ((codePoint >> 10) & 0x3FF) | 0xD800; From 19606f083c2858bd5ee5d0f3585db5d310cc5104 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 19 Dec 2019 14:24:41 -0700 Subject: [PATCH 31/34] ObjectiveC: Guard against high-surrogate(null)low-surrogate --- objectivec/DiffMatchPatch.m | 20 ++++++++++++++++++-- objectivec/Tests/DiffMatchPatchTest.m | 4 ++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index 98455af..ff1ba19 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1492,8 +1492,24 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded @catch (NSException *e) { return nil; } - - return [NSString stringWithCharacters:decoded length:output]; + + // some objective-c versions of the library produced patches with + // (null) in the place where surrogates were split across diff + // boundaries. if we leave those in we'll be stuck with a + // high-surrogate (null) low-surrogate pattern that will break + // deeper in the library or consumping application. 
we'll "fix" + // these by dropping the (null) and re-joining the surrogate halves + NSString *result = [NSString stringWithCharacters:decoded length:output]; + NSRegularExpression *replacer = [NSRegularExpression + regularExpressionWithPattern:@"([\\x{D800}-\\x{DBFF}])\\(null\\)([\\x{DC00}-\\x{DFFF}])" + options:0 + error:nil]; + + return [replacer + stringByReplacingMatchesInString:result + options:0 + range:NSMakeRange(0, [result length]) + withTemplate:@"$1$2"]; } /** diff --git a/objectivec/Tests/DiffMatchPatchTest.m b/objectivec/Tests/DiffMatchPatchTest.m index 7e31508..6978908 100755 --- a/objectivec/Tests/DiffMatchPatchTest.m +++ b/objectivec/Tests/DiffMatchPatchTest.m @@ -814,6 +814,10 @@ - (void)test_diff_deltaTest { [Diff diffWithOperation:DIFF_INSERT andText:[NSString stringWithFormat:@"%C", 0xdd71]], nil])]); + // Invalid diff from objective-c with (null) string + XCTAssertEqualObjects([dmp diff_fromDeltaWithText:@"" andDelta:@"+%ED%A0%BC%28null%29%ED%B5%B0" error:nil], + ([NSMutableArray arrayWithObjects:[Diff diffWithOperation:DIFF_INSERT andText:@"πŸ…°"],nil])); + // Verify pool of unchanged characters. diffs = [NSMutableArray arrayWithObject: [Diff diffWithOperation:DIFF_INSERT andText:@"A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "]]; From 477b5a669d396f68b5dc66e778cc54ff5026fa2e Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 19 Dec 2019 14:26:59 -0700 Subject: [PATCH 32/34] Fix typo --- javascript/diff_match_patch_uncompressed.js | 2 +- objectivec/DiffMatchPatch.m | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/javascript/diff_match_patch_uncompressed.js b/javascript/diff_match_patch_uncompressed.js index 2b93ecc..8101847 100644 --- a/javascript/diff_match_patch_uncompressed.js +++ b/javascript/diff_match_patch_uncompressed.js @@ -1516,7 +1516,7 @@ diff_match_patch.prototype.decodeURI = function(text) { // (null) in the place where surrogates were split across diff // boundaries. 
if we leave those in we'll be stuck with a // high-surrogate (null) low-surrogate pattern that will break - // deeper in the library or consumping application. we'll "fix" + // deeper in the library or consuming application. we'll "fix" // these by dropping the (null) and re-joining the surrogate halves return decoded.replace(/([\uD800-\uDBFF])\(null\)([\uDC00-\uDFFF])/g, "$1$2"); } diff --git a/objectivec/DiffMatchPatch.m b/objectivec/DiffMatchPatch.m index ff1ba19..0c56201 100755 --- a/objectivec/DiffMatchPatch.m +++ b/objectivec/DiffMatchPatch.m @@ -1497,7 +1497,7 @@ - (NSString *)diff_decodeURIWithText:(NSString *)percentEncoded // (null) in the place where surrogates were split across diff // boundaries. if we leave those in we'll be stuck with a // high-surrogate (null) low-surrogate pattern that will break - // deeper in the library or consumping application. we'll "fix" + // deeper in the library or consuming application. we'll "fix" // these by dropping the (null) and re-joining the surrogate halves NSString *result = [NSString stringWithCharacters:decoded length:output]; NSRegularExpression *replacer = [NSRegularExpression From fa122d3dfef5359f5af3535ed754083c6f16578e Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 19 Dec 2019 14:34:48 -0700 Subject: [PATCH 33/34] Java: Use StringBuilder in decodeURI and guard against H(null)L pattern --- .../fraser/neil/plaintext/diff_match_patch.java | 13 +++++++++++-- .../neil/plaintext/diff_match_patch_test.java | 6 ++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index 9f3f71e..71e6480 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -1507,7 +1507,7 @@ private int digit16(char b) throws IllegalArgumentException { private String decodeURI(String text) throws IllegalArgumentException { 
int i = 0; - StringBuffer decoded = new StringBuffer(""); + StringBuilder decoded = new StringBuilder(text.length()); while (i < text.length()) { if (text.charAt(i) != '%') { @@ -1576,7 +1576,16 @@ private String decodeURI(String text) throws IllegalArgumentException { throw new IllegalArgumentException(); } - return decoded.toString(); + // some objective-c versions of the library produced patches with + // (null) in the place where surrogates were split across diff + // boundaries. if we leave those in we'll be stuck with a + // high-surrogate (null) low-surrogate pattern that will break + // deeper in the library or consuming application. we'll "fix" + // these by dropping the (null) and re-joining the surrogate halves + return decoded.toString().replaceAll( + "([\\uD800-\\uDBFF])\\(null\\)([\\uDC00-\\uDFFF])", + "$1$2" + ); } /** diff --git a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java index 5be10f1..04aa860 100644 --- a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java +++ b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java @@ -460,6 +460,12 @@ public static void testDiffDelta() { dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1")) ); + assertEquals( + "diff_fromDelta: Invalid diff from objective-c with (null) string", + diffList(new Diff(INSERT, "\ud83c\udd70")), + dmp.diff_fromDelta("", "+%ED%A0%BC%28null%29%ED%B5%B0") + ); + // Verify pool of unchanged characters. diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? 
: @ & = + $ , # ")); String text2 = dmp.diff_text2(diffs); From 21aebb404b5af9050ceaeb518cf9a352688ce620 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 14 Jan 2020 20:10:18 -0700 Subject: [PATCH 34/34] Remove attempt at fixing the (null) bug --- .../name/fraser/neil/plaintext/diff_match_patch.java | 11 +---------- .../fraser/neil/plaintext/diff_match_patch_test.java | 6 ------ javascript/diff_match_patch.js | 10 +++++----- javascript/diff_match_patch_uncompressed.js | 9 +-------- javascript/tests/diff_match_patch_test.js | 11 +---------- objectivec/Tests/DiffMatchPatchTest.m | 4 ---- 6 files changed, 8 insertions(+), 43 deletions(-) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index 71e6480..fe5b55d 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -1576,16 +1576,7 @@ private String decodeURI(String text) throws IllegalArgumentException { throw new IllegalArgumentException(); } - // some objective-c versions of the library produced patches with - // (null) in the place where surrogates were split across diff - // boundaries. if we leave those in we'll be stuck with a - // high-surrogate (null) low-surrogate pattern that will break - // deeper in the library or consuming application. 
we'll "fix" - // these by dropping the (null) and re-joining the surrogate halves - return decoded.toString().replaceAll( - "([\\uD800-\\uDBFF])\\(null\\)([\\uDC00-\\uDFFF])", - "$1$2" - ); + return decoded.toString(); } /** diff --git a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java index 04aa860..5be10f1 100644 --- a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java +++ b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java @@ -460,12 +460,6 @@ public static void testDiffDelta() { dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1")) ); - assertEquals( - "diff_fromDelta: Invalid diff from objective-c with (null) string", - diffList(new Diff(INSERT, "\ud83c\udd70")), - dmp.diff_fromDelta("", "+%ED%A0%BC%28null%29%ED%B5%B0") - ); - // Verify pool of unchanged characters. diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")); String text2 = dmp.diff_text2(diffs); diff --git a/javascript/diff_match_patch.js b/javascript/diff_match_patch.js index 37bd784..8f33865 100644 --- a/javascript/diff_match_patch.js +++ b/javascript/diff_match_patch.js @@ -28,11 +28,11 @@ diff_match_patch.prototype.diff_xIndex=function(a,b){var d=0,c=0,e=0,f=0,g;for(g diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],d=/&/g,c=//g,f=/\n/g,g=0;g");switch(h){case DIFF_INSERT:b[g]=''+l+"";break;case DIFF_DELETE:b[g]=''+l+"";break;case DIFF_EQUAL:b[g]=""+l+""}}return b.join("")}; diff_match_patch.prototype.diff_text1=function(a){for(var b=[],d=0;d=a};diff_match_patch.prototype.isLowSurrogate=function(a){a=a.charCodeAt(0);return 56320<=a&&57343>=a}; -diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],d,c=0;c=c)){d+=String.fromCharCode((c&65535)>>>10&1023|55296);d+=String.fromCharCode(56320|c&1023);b+=12;continue}throw new URIError("URI malformed"); -}}}}return d}}; 
+diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],d,c=0;c=c)){d+=String.fromCharCode((c&65535)>>>10&1023|55296);d+=String.fromCharCode(56320|c&1023); +b+=12;continue}throw new URIError("URI malformed");}}}}return d}}; diff_match_patch.prototype.diff_fromDelta=function(a,b){for(var d=[],c=0,e=0,f=b.split(/\t/g),g=0;gthis.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,d);-1!=h&&(g=Math.min(c(0,h),g),h=a.lastIndexOf(b,d+b.length),-1!=h&&(g=Math.min(c(0,h),g)));var l=1<