User:Cacycle/diff.js: Difference between revisions

Content deleted Content added
1.0.9 (September 02, 2014) fix old IE compatibility of handler, newline sign to blank, newline sign for moved blocks, moved block gray a bit darker, trim moved group borders to first word or newline
1.0.10 (September 08, 2014) + .unique, optimize speed (CalculateDiff, ShortenOutput), ShortenOutput: paragraph max/min, symbols objects instead of .parsed
Line 3:
// ==UserScript==
// @name wDiff
// @version 1.0.910
// @date September 0308, 2014
// @description improved word-based diff library with block move detection
// @homepage https://en.wikipedia.org/wiki/User:Cacycle/diff
Line 22:
 
Additional features:
 
* Word (token) types have been optimized for MediaWiki source texts
* Resolution down to characters level
* Stepwise token size refinement, starting with paragraphs, then sentences, words, and finally characters
* Highlighting of moved blocks and their original position marks
* Stepwise split (paragraphs, sentences, words, chars)
* Recursive diff
* Additional post-pass-5 code for resolving islands caused by common tokens at the border of sequences of common tokens
* Block move detection and visualization
* Color coding of moved blocks and their marks at the original position
* Block detection minimizesMinimizing length of moved vs. static blocks
* Bubbling up of ambiguous unresolved regions to next line break
* Optional omission of unchanged irrelevant parts from the output
* Fully customizable
Line 51 ⟶ 56:
.number: list enumeration number
.parsed: token has been added to symbol table
.unique: unique token in whole text
.first: index of first token in tokens list
.last: index of last token in tokens list
Line 69 ⟶ 75:
.oldStart: old text token index of first token
.count number of tokens
.unique: contains unique matched token
.words: word count
.chars: char length
Line 81 ⟶ 88:
.blockStart: first block index
.blockEnd: last block index
.unique: contains unique matched token
.maxWords: word count of longest block
.words: word count
Line 125 ⟶ 133:
if (wDiff.letters === undefined) { wDiff.letters = 'a-zA-Z0-9' + '00AA00B500BA00C0-00D600D8-00F600F8-02C102C6-02D102E0-02E402EC02EE0370-037403760377037A-037D03860388-038A038C038E-03A103A3-03F503F7-0481048A-05270531-055605590561-058705D0-05EA05F0-05F20620-064A066E066F0671-06D306D506E506E606EE06EF06FA-06FC06FF07100712-072F074D-07A507B107CA-07EA07F407F507FA0800-0815081A082408280840-085808A008A2-08AC0904-0939093D09500958-09610971-09770979-097F0985-098C098F09900993-09A809AA-09B009B209B6-09B909BD09CE09DC09DD09DF-09E109F009F10A05-0A0A0A0F0A100A13-0A280A2A-0A300A320A330A350A360A380A390A59-0A5C0A5E0A72-0A740A85-0A8D0A8F-0A910A93-0AA80AAA-0AB00AB20AB30AB5-0AB90ABD0AD00AE00AE10B05-0B0C0B0F0B100B13-0B280B2A-0B300B320B330B35-0B390B3D0B5C0B5D0B5F-0B610B710B830B85-0B8A0B8E-0B900B92-0B950B990B9A0B9C0B9E0B9F0BA30BA40BA8-0BAA0BAE-0BB90BD00C05-0C0C0C0E-0C100C12-0C280C2A-0C330C35-0C390C3D0C580C590C600C610C85-0C8C0C8E-0C900C92-0CA80CAA-0CB30CB5-0CB90CBD0CDE0CE00CE10CF10CF20D05-0D0C0D0E-0D100D12-0D3A0D3D0D4E0D600D610D7A-0D7F0D85-0D960D9A-0DB10DB3-0DBB0DBD0DC0-0DC60E01-0E300E320E330E40-0E460E810E820E840E870E880E8A0E8D0E94-0E970E99-0E9F0EA1-0EA30EA50EA70EAA0EAB0EAD-0EB00EB20EB30EBD0EC0-0EC40EC60EDC-0EDF0F000F40-0F470F49-0F6C0F88-0F8C1000-102A103F1050-1055105A-105D106110651066106E-10701075-1081108E10A0-10C510C710CD10D0-10FA10FC-1248124A-124D1250-12561258125A-125D1260-1288128A-128D1290-12B012B2-12B512B8-12BE12C012C2-12C512C8-12D612D8-13101312-13151318-135A1380-138F13A0-13F41401-166C166F-167F1681-169A16A0-16EA1700-170C170E-17111720-17311740-17511760-176C176E-17701780-17B317D717DC1820-18771880-18A818AA18B0-18F51900-191C1950-196D1970-19741980-19AB19C1-19C71A00-1A161A20-1A541AA71B05-1B331B45-1B4B1B83-1BA01BAE1BAF1BBA-1BE51C00-1C231C4D-1C4F1C5A-1C7D1CE9-1CEC1CEE-1CF11CF51CF61D00-1DBF1E00-1F151F18-1F1D1F20-1F451F48-1F4D1F50-1F571F591F5B1F5D1F5F-1F7D1F80-1FB41FB6-1FBC1FBE1FC2-1FC41FC6-1FCC1FD0-1FD31FD6-1FDB1FE0-1FEC1FF2-1FF41FF6-1FFC2071207F2090-209C21022107210A-211321152119-211D212421262128212A-212D212F-2139213C-213F2145-2149214E218321842C00-2C2E2C30-2C5E2C60-2CE42CEB-2CEE2CF22CF32D00-2D252D272D2D2D30-2D672D6F2D80-2D962DA0-2DA62DA8-2DAE2DB0-2DB62DB8-2DBE2DC0-2DC62DC8-2DCE2DD0-2DD62DD8-2DDE2E2F300530063031-3035303B303C3041-3096309D-309F30A1-30FA30FC-30FF3105-312D3131-318E31A0-31BA31F0-31FF3400-4DB54E00-9FCCA000-A48CA4D0-A4FDA500-A60CA610-A61FA62AA62BA640-A66EA67F-A697A6A0-A6E5A717-A71FA722-A788A78B-A78EA790-A793A7A0-A7AAA7F8-A801A803-A805A807-A80AA80C-A822A840-A873A882-A8B3A8F2-A8F7A8FBA90A-A925A930-A946A960-A97CA984-A9B2A9CFAA00-AA28AA40-AA42AA44-AA4BAA60-AA76AA7AAA80-AAAFAAB1AAB5AAB6AAB9-AABDAAC0AAC2AADB-AADDAAE0-AAEAAAF2-AAF4AB01-AB06AB09-AB0EAB11-AB16AB20-AB26AB28-AB2EABC0-ABE2AC00-D7A3D7B0-D7C6D7CB-D7FBF900-FA6DFA70-FAD9FB00-FB06FB13-FB17FB1DFB1F-FB28FB2A-FB36FB38-FB3CFB3EFB40FB41FB43FB44FB46-FBB1FBD3-FD3DFD50-FD8FFD92-FDC7FDF0-FDFBFE70-FE74FE76-FEFCFF21-FF3AFF41-FF5AFF66-FFBEFFC2-FFC7FFCA-FFCFFFD2-FFD7FFDA-FFDC'.replace(/(\w{4})/g, '\\u$1'); }
 
// regExpregExps for splitting into paragraphs after newlinetext
if (wDiff.regExpParagraphregExpSplit === undefined) { wDiff.regExpParagraph = new RegExp('(.|\\n)+?(\\n|$)', 'g'); }
wDiff.regExpSplit = {
// after newline
paragraph: new RegExp('(.|\\n)+?(\\n|$)', 'g'),
// after .spaces or before newline
sentence: new RegExp('\\n|.*?\\.( +|(?=\\n))|.+?(?=\\n)', 'g'),
// words, multi-char markup, and chars
word: new RegExp('([' + wDiff.letters + '])+|\\[\\[|\\]\\]|\\{\\{|\\}\\}|&\\w+;|\'\'\'|\'\'|==+|\\{\\||\\|\\}|\\|-|.', 'g'),
 
// chars
// regExp for splitting into sentences after .spaces or before newline
character: new RegExp('.', 'g')
if (wDiff.regExpSentence === undefined) { wDiff.regExpSentence = new RegExp('\\n|.*?\\.( +|(?=\\n))|.+?(?=\\n)', 'g'); }
};
 
}
// regExp for splitting into words, multi-char markup, and chars
if (wDiff.regExpWord === undefined) { wDiff.regExpWord = new RegExp('([' + wDiff.letters + '])+|\\[\\[|\\]\\]|\\{\\{|\\}\\}|&\\w+;|\'\'\'|\'\'|==+|\\{\\||\\|\\}|\\|-|.', 'g'); }
 
// regExp for splitting into chars
if (wDiff.regExpChar === undefined) { wDiff.regExpChar = new RegExp('.', 'g'); }
 
// regExps for bubbling up gaps
Line 143 ⟶ 157:
// regExp for counting words
if (wDiff.regExpWordCount === undefined) { wDiff.regExpWordCount = new RegExp('(^|[^' + wDiff.letters + '])[' + wDiff.letters + '][' + wDiff.letters + '_\'’]*', 'g'); }
 
// regExp for wiki code non-letter characters
if (wDiff.regExpWikiCodeChars === undefined) { wDiff.regExpWikiCodeChars = /^[ \t\n\[\]{}|+\-!*#:;=<>'\/_,.&?]+$/; }
 
//
Line 149 ⟶ 166:
 
// characters before diff tag to search for previous heading, paragraph, line break, cut characters
if (wDiff.headingBefore === undefined) { wDiff.headingBefore = 1500; }
if (wDiff.paragraphBeforeparagraphBeforeMax === undefined) { wDiff.paragraphBeforeparagraphBeforeMax = 1500; }
if (wDiff.lineBeforeMax paragraphBeforeMin === undefined) { wDiff.lineBeforeMaxparagraphBeforeMin = = 1000500; }
if (wDiff.lineBeforeMinlineBeforeMax === undefined) { wDiff.lineBeforeMinlineBeforeMax = 500 = 1000; }
if (wDiff.blankBeforeMaxlineBeforeMin === undefined) { wDiff.blankBeforeMaxlineBeforeMin = 1000 500; }
if (wDiff.blankBeforeMinblankBeforeMax === undefined) { wDiff.blankBeforeMinblankBeforeMax = 500 = 1000; }
if (wDiff.charsBeforeblankBeforeMin === undefined) { wDiff.charsBeforeblankBeforeMin = 500; }
if (wDiff.charsBefore === undefined) { wDiff.charsBefore = 500; }
 
// characters after diff tag to search for next heading, paragraph, line break, or characters
if (wDiff.headingAfter === undefined) { wDiff.headingAfter = 1500; }
if (wDiff.paragraphAfterparagraphAfterMax === undefined) { wDiff.paragraphAfterparagraphAfterMax = 1500; }
if (wDiff.lineAfterMax paragraphAfterMin === undefined) { wDiff.lineAfterMaxparagraphAfterMin = = 1000500; }
if (wDiff.lineAfterMinlineAfterMax === undefined) { wDiff.lineAfterMinlineAfterMax = 500 = 1000; }
if (wDiff.blankAfterMaxlineAfterMin === undefined) { wDiff.blankAfterMaxlineAfterMin = 1000 500; }
if (wDiff.blankAfterMinblankAfterMax === undefined) { wDiff.blankAfterMinblankAfterMax = 500 = 1000; }
if (wDiff.charsAfterblankAfterMin === undefined) { wDiff.charsAfterblankAfterMin = 500; }
if (wDiff.charsAfter === undefined) { wDiff.charsAfter = 500; }
 
// lines before and after diff tag to search for previous heading, paragraph, line break, cut characters
Line 171 ⟶ 190:
 
// maximal fragment distance to join close fragments
if (wDiff.fragmentJoinLines === undefined) { wDiff.fragmentJoinLines = 105; }
if (wDiff.fragmentJoinChars === undefined) { wDiff.fragmentJoinChars = 1000; }
 
Line 459 ⟶ 478:
return text.diff;
}
// new symbols object
var symbols = {
token: [],
hash: {}
};
 
// split new and old text into paragraps
wDiff.Split(text.newText, wDiff.regExpParagraph'paragraph');
wDiff.Split(text.oldText, wDiff.regExpParagraph'paragraph');
 
// calculate diff
wDiff.CalculateDiff(text, symbols, 'paragraph');
 
// refine different paragraphs into sentences
wDiff.SplitRefine(text.newText, wDiff.regExpSentence'sentence');
wDiff.SplitRefine(text.oldText, wDiff.regExpSentence'sentence');
 
// calculate refined diff
wDiff.CalculateDiff(text, symbols, 'sentence');
 
// refine different sentences into words
wDiff.SplitRefine(text.newText, wDiff.regExpWord'word');
wDiff.SplitRefine(text.oldText, wDiff.regExpWord'word');
 
// calculate refined diff information with recursion for unresolved gaps
wDiff.CalculateDiff(text, symbols, 'word', true);
 
// bubble up gaps
Line 490 ⟶ 515:
 
// calculate refined diff information with recursion for unresolved gaps
wDiff.CalculateDiff(text, symbols, 'character', true);
}
 
Line 518 ⟶ 543:
// called from: wDiff.Diff()
 
wDiff.Split = function (text, regExplevel, token) {
 
var prev = null;
Line 525 ⟶ 550:
var first = current;
var string = '';
 
// split full text or specified token
if (token === undefined) {
Line 539 ⟶ 564:
var number = 0;
var regExpMatch;
while ( (regExpMatch = regExpwDiff.regExpSplit[level].exec(string)) !== null) {
 
// insert current item, link to previous
Line 548 ⟶ 573:
link: null,
number: null,
parsed: false,
unique: false
};
number ++;
Line 833 ⟶ 859:
// refine different words into chars
else {
wDiff.Split(text.newText, wDiff.regExpChar'character', i);
wDiff.Split(text.oldText, wDiff.regExpChar'character', j);
}
 
Line 931 ⟶ 957:
 
// wDiff.CalculateDiff: calculate diff information, can be called repeatedly during refining
// input: text,: object containing text data and tokens; level: 'paragraph', 'sentence', 'word', or 'character'
// optionally for recursive calls: newStart, newEnd, oldStart, oldEnd (tokens list indexes), recursionLevel
// changes: text.oldText/newText.tokens[].link, links corresponding tokens from old and new text
Line 937 ⟶ 963:
// pass 1: parse new text into symbol table
// pass 2: parse old text into symbol table
// pass 3: connect unique matched tokens
// pass 4: connect adjacent identical tokens downwards
// pass 5: connect adjacent identical tokens upwards
Line 943 ⟶ 969:
// recursively diff still unresolved regions upwards
 
wDiff.CalculateDiff = function (text, symbols, level, recurse, newStart, newEnd, oldStart, oldEnd, recursionLevel) {
 
// symbol (token) data
var symbol = [];
var symbols = {};
 
// set defaults
Line 955 ⟶ 977:
if (oldEnd === undefined) { oldEnd = text.oldText.last; }
if (recursionLevel === undefined) { recursionLevel = 0; }
var linked = false;
 
// limit recursion depth
Line 969 ⟶ 992:
while ( (i !== null) && (text.newText.tokens[i] !== null) ) {
 
// add new entry to symbol table
// parse token only once during split refinement
ifvar (token = (text.newText.tokens[i].parsed === false) || (recursionLevel > 0) ) {token;
if (Object.prototype.hasOwnProperty.call(symbols.hash, token) === false) {
text.newText.tokens[i].parsed = true;
var current = symbols.token.length;
symbols.hash[token] = current;
symbols.token[current] = {
newCount: 1,
oldCount: 0,
newToken: i,
oldToken: null
};
}
 
// addor newupdate existing entry to symbol table
else {
var token = text.newText.tokens[i].token;
if (Object.prototype.hasOwnProperty.call(symbols, token) === false) {
var current = symbol.length;
symbols[token] = current;
symbol[current] = {
newCount: 1,
oldCount: 0,
newToken: i,
oldToken: null
};
}
 
// orincrement updatetoken existingcounter entryfor new text
var hashToArray = symbols.hash[token];
else {
symbols.token[hashToArray].newCount ++;
 
// increment token counter for new text
var hashToArray = symbols[token];
symbol[hashToArray].newCount ++;
}
}
 
Line 1,010 ⟶ 1,028:
while ( (j !== null) && (text.oldText.tokens[j] !== null) ) {
 
// add new entry to symbol table
// parse token only once during split refinement
ifvar (token = (text.oldText.tokens[j].parsed === false) || (recursionLevel > 0) ) {token;
if (Object.prototype.hasOwnProperty.call(symbols.hash, token) === false) {
text.oldText.tokens[j].parsed = true;
var current = symbols.token.length;
symbols.hash[token] = current;
symbols.token[current] = {
newCount: 0,
oldCount: 1,
newToken: null,
oldToken: j
};
}
 
// addor newupdate existing entry to symbol table
else {
var token = text.oldText.tokens[j].token;
if (Object.prototype.hasOwnProperty.call(symbols, token) === false) {
var current = symbol.length;
symbols[token] = current;
symbol[current] = {
newCount: 0,
oldCount: 1,
newToken: null,
oldToken: j
};
}
 
// orincrement updatetoken existingcounter entryfor old text
var hashToArray = symbols.hash[token];
else {
symbols.token[hashToArray].oldCount ++;
 
// incrementadd token counternumber for old text
var symbols.token[hashToArray].oldToken = symbols[token]j;
symbol[hashToArray].oldCount ++;
 
// add token number for old text
symbol[hashToArray].oldToken = j;
}
}
 
Line 1,051 ⟶ 1,064:
 
// cycle trough symbol array
for (var i = 0; i < symbolsymbols.token.length; i ++) {
 
// find tokens in the symbol table that occur only once in both versions
if ( (symbolsymbols.token[i].newCount == 1) && (symbolsymbols.token[i].oldCount == 1) ) {
var newToken = symbolsymbols.token[i].newToken;
var oldToken = symbolsymbols.token[i].oldToken;
 
// do not use spaces as unique markers
Line 1,065 ⟶ 1,078:
text.newText.tokens[newToken].link = oldToken;
text.oldText.tokens[oldToken].link = newToken;
linked = true;
if ( (level != 'character') && (recursionLevel === 0) ) {
text.newText.tokens[newToken].unique = true;
text.oldText.tokens[oldToken].unique = true;
}
}
}
Line 1,075 ⟶ 1,093:
 
// cycle trough new text tokens list
if (linked === true) {
var i = text.newText.first;
var i = text.newText.first;
while ( (i !== null) && (text.newText.tokens[i] !== null) ) {
var iNext = text.newText.tokens[i].next;
 
// find already connected pairs
var j = text.newText.tokens[i].link;
if (j !== null) {
var jNext = text.oldText.tokens[j].next;
 
// check if the following tokens are not yet connected
if ( (iNext !== null) && (jNext !== null) ) {
if ( (text.newText.tokens[iNext].link === null) && (text.oldText.tokens[jNext].link === null) ) {
 
// connect if the following tokens are the same
if (text.newText.tokens[iNext].token == text.oldText.tokens[jNext].token) {
text.newText.tokens[iNext].link = jNext;
text.oldText.tokens[jNext].link = iNext;
}
}
}
}
i = iNext;
}
i = iNext;
}
 
//
// pass 5: connect adjacent identical tokens upwards
//
 
// cycle trough new text tokens list
var i = text.newText.last;
while ( (i !== null) && (text.newText.tokens[i] !== null) ) {
var iNext = text.newText.tokens[i].prev;
 
// find already connected pairs
var j = text.newText.tokens[i].link;
if (j !== null) {
var jNext = text.oldText.tokens[j].prev;
 
// check if the preceeding tokens are not yet connected
if ( (iNext !== null) && (jNext !== null) ) {
if ( (text.newText.tokens[iNext].link === null) && (text.oldText.tokens[jNext].link === null) ) {
 
// connect if the preceeding tokens are the same
if (text.newText.tokens[iNext].token == text.oldText.tokens[jNext].token) {
text.newText.tokens[iNext].link = jNext;
text.oldText.tokens[jNext].link = iNext;
}
}
}
}
i = iNext;
}
i = iNext;
}
 
// refine by recursively diffing unresolved regions caused by addition of common tokens around sequences of common tokens, only at word level split
if ( (recurse === true) && (wDiff.recursiveDiff === true) ) {
 
//
// recursively diff still unresolved regions downwards
//
 
// cycle trough new text tokens list
var i = newStart;
var j = oldStart;
 
while ( (i !== null) && (text.newText.tokens[i] !== null) ) {
 
// get j from previous tokens match
var iPrev = text.newText.tokens[i].prev;
if (iPrev !== null) {
var jPrev = text.newText.tokens[iPrev].link;
if (jPrev !== null) {
j = text.oldText.tokens[jPrev].next;
}
}
}
 
// check for the start of an unresolved sequence
if ( (j !== null) && (text.oldText.tokens[j] !== null) && (text.newText.tokens[i].link === null) && (text.oldText.tokens[j].link === null) ) {
 
// determine the limits of of the unresolved new sequence
var iStart = i;
var iEnd = null;
var iLength = 0;
var iNext = i;
while ( (iNext !== null) && (text.newText.tokens[iNext].link === null) ) {
iEnd = iNext;
iLength ++;
if (iEnd == newEnd) {
break;
}
iNext = text.newText.tokens[iNext].next;
}
iNext = text.newText.tokens[iNext].next;
}
 
// determine the limits of of the unresolved old sequence
var jStart = j;
var jEnd = null;
var jLength = 0;
var jNext = j;
while ( (jNext !== null) && (text.oldText.tokens[jNext].link === null) ) {
jEnd = jNext;
jLength ++;
if (jEnd == oldEnd) {
break;
}
jNext = text.oldText.tokens[jNext].next;
}
jNext = text.oldText.tokens[jNext].next;
}
 
// recursively diff the unresolved sequence
if ( (iLength > 0) && (jLength > 0) ) {
if ( (iLength > 1) || (jLength > 1) ) {
if ( (iStart != newStart) || (iEnd != newEnd) || (jStart != oldStart) || (jEnd != oldEnd) ) {
// new symbols object for sub-region
wDiff.CalculateDiff(text, true, iStart, iEnd, jStart, jEnd, recursionLevel + 1);
}var symbolsRecurse = {
token: [],
hash: {}
};
wDiff.CalculateDiff(text, symbolsRecurse, level, true, iStart, iEnd, jStart, jEnd, recursionLevel + 1);
}
i = iEnd;
}
i = iEnd;
}
 
// next list element
if (i == newEnd) {
break;
}
i = text.newText.tokens[i].next;
}
i = text.newText.tokens[i].next;
}
 
//
// recursively diff still unresolved regions upwards
//
 
// cycle trough new text tokens list
var i = newEnd;
var j = oldEnd;
while ( (i !== null) && (text.newText.tokens[i] !== null) ) {
 
// get j from next matched tokens
var iPrev = text.newText.tokens[i].next;
if (iPrev !== null) {
var jPrev = text.newText.tokens[iPrev].link;
if (jPrev !== null) {
j = text.oldText.tokens[jPrev].prev;
}
}
}
 
// check for the start of an unresolved sequence
if ( (j !== null) && (text.oldText.tokens[j] !== null) && (text.newText.tokens[i].link === null) && (text.oldText.tokens[j].link === null) ) {
 
// determine the limits of of the unresolved new sequence
var iStart = null;
var iEnd = i;
var iLength = 0;
var iNext = i;
while ( (iNext !== null) && (text.newText.tokens[iNext].link === null) ) {
iStart = iNext;
iLength ++;
if (iStart == newStart) {
break;
}
iNext = text.newText.tokens[iNext].prev;
}
iNext = text.newText.tokens[iNext].prev;
// determine the limits of of the unresolved old sequence
}
var jStart = null;
 
var jEnd = j;
// determine the limits of of the unresolved old sequence
var jStartjLength = null0;
var jEndjNext = j;
while ( (jNext !== null) && (text.oldText.tokens[jNext].link === null) ) {
var jLength = 0;
var jNext jStart = jjNext;
jLength ++;
while ( (jNext !== null) && (text.oldText.tokens[jNext].link === null) ) {
if (jStart == oldStart) jNext;{
jLength ++ break;
}
if (jStart == oldStart) {
jNext = text.oldText.tokens[jNext].prev;
break;
}
jNext = text.oldText.tokens[jNext].prev;
}
 
// recursively diff the unresolved sequence
if ( (iLength > 0) && (jLength > 0) ) {
if ( (iLength > 1) || (jLength > 1) ) {
if ( (iStart != newStart) || (iEnd != newEnd) || (jStart != oldStart) || (jEnd != oldEnd) ) {
// new symbols object for sub-region
wDiff.CalculateDiff(text, true, iStart, iEnd, jStart, jEnd, recursionLevel + 1);
}var symbolsRecurse = {
token: [],
hash: {}
};
wDiff.CalculateDiff(text, symbolsRecurse, level, true, iStart, iEnd, jStart, jEnd, recursionLevel + 1);
}
i = iStart;
}
i = iStart;
}
 
// next list element
if (i == newStart) {
break;
}
i = text.newText.tokens[i].prev;
}
i = text.newText.tokens[i].prev;
}
}
Line 1,303 ⟶ 1,327:
// set longest sequence of increasing groups in sections as fixed (not moved)
wDiff.SetFixed(blocks, groups, sections);
 
// convert groups to insertions/deletions if maximal block length is too short
if ( (wDiff.blockMinLength > 0) && (wDiff.UnlinkBlocks(text, blocks, groups) === true) ) {
 
// repeat from start after conversion to insertions/deletions
wDiff.GetSameBlocks(text, blocks);
wDiff.GetSections(blocks, sections);
wDiff.GetGroups(blocks, groups);
wDiff.SetFixed(blocks, groups, sections);
}
 
// collect deletion ('del') blocks from old text
Line 1,319 ⟶ 1,333:
// position 'del' blocks into new text order
wDiff.PositionDelBlocks(blocks);
 
// sort blocks by new text token number and update groups
wDiff.SortBlocks(blocks, groups);
 
// convert groups to insertions/deletions if maximal block length is too short
if (wDiff.blockMinLength > 0) {
var unlinked = wDiff.UnlinkBlocks(text, blocks, groups);
 
// repeat from start after conversion
if (unlinked === true) {
wDiff.GetSameBlocks(text, blocks);
wDiff.GetSections(blocks, sections);
wDiff.GetGroups(blocks, groups);
wDiff.SetFixed(blocks, groups, sections);
wDiff.GetDelBlocks(text, blocks);
wDiff.PositionDelBlocks(blocks);
}
}
 
// collect insertion ('ins') blocks from new text
Line 1,369 ⟶ 1,402:
// detect matching blocks ('same')
var count = 0;
var unique = false;
var chars = 0;
var string = '';
Line 1,374 ⟶ 1,408:
var token = text.oldText.tokens[j].token;
count ++;
unique = unique || text.newText.tokens[i].unique;
chars += token.length;
string += token;
Line 1,388 ⟶ 1,423:
oldStart: jStart,
count: count,
unique: unique,
words: wDiff.WordCount(string),
chars: chars,
Line 1,485 ⟶ 1,521:
var words = wDiff.WordCount(blocks[block].string);
var maxWords = words;
var unique = false;
var chars = blocks[block].chars;
 
Line 1,500 ⟶ 1,537:
maxWords = blocks[i].words;
}
unique = unique || blocks[i].unique;
words += blocks[i].words;
chars += blocks[i].chars;
Line 1,525 ⟶ 1,563:
blockStart: groupStart,
blockEnd: groupEnd,
unique: unique,
maxWords: maxWords,
words: words,
Line 1,536 ⟶ 1,575:
block = groupEnd;
}
}
return;
};
 
 
// wDiff.UnlinkBlocks: convert 'same' blocks in groups of continuous old text blocks into 'ins'/'del' pairs if too short
// called from: DetectBlocks()
// changes: text.newText/oldText[].link
// returns: true if text tokens were unlinked
 
wDiff.UnlinkBlocks = function (text, blocks, groups) {
 
var unlinked = false;
 
// cycle through groups
for (var group = 0; group < groups.length; group ++) {
if (groups[group].fixed === false) {
var blockStart = groups[group].blockStart;
var blockEnd = groups[group].blockEnd;
 
// unlink whole group if no block is at least blockMinLength words long
if (groups[group].maxWords < wDiff.blockMinLength) {
for (var block = blockStart; block <= blockEnd; block ++) {
wDiff.UnlinkSingleBlock(blocks[block], text);
unlinked = true;
}
}
 
// unlink border blocks without words/blanks
else {
 
// unlink start blocks of 0 words/newlines
for (var block = blockStart; block <= blockEnd; block ++) {
if ( (blocks[block].words !== 0) || (blocks[block].string == '\n') ) {
break;
}
wDiff.UnlinkSingleBlock(blocks[block], text);
unlinked = true;
blockStart = block;
}
 
// unlink end blocks of 0 words/newlines
for (var block = blockEnd; block > blockStart; block --) {
if ( (blocks[block].words !== 0) || (blocks[block].string == '\n') ) {
break;
}
wDiff.UnlinkSingleBlock(blocks[block], text);
unlinked = true;
}
}
}
}
return unlinked;
};
 
 
// wDiff.UnlinkBlock: un-link text tokens of block, converting them into 'ins'/'del' pairs
// called from: wDiff.UnlinkBlocks()
// changes: text.newText/oldText[].link
 
wDiff.UnlinkSingleBlock = function (block, text) {
 
// cycle through old text
var j = block.oldStart;
for (var count = 0; count < block.count; count ++) {
 
// unlink tokens
text.newText.tokens[ text.oldText.tokens[j].link ].link = null;
text.oldText.tokens[j].link = null;
j = text.oldText.tokens[j].next;
}
return;
Line 1,738 ⟶ 1,707:
oldStart: oldStart,
count: count,
unique: false,
words: null,
chars: null,
Line 1,835 ⟶ 1,805:
delBlock.fixed = neighbor.fixed;
}
}
return;
};
 
 
// wDiff.UnlinkBlocks: convert 'same' blocks in groups into 'ins'/'del' pairs if too short
// called from: DetectBlocks()
// changes: text.newText/oldText[].link
// returns: true if text tokens were unlinked
 
wDiff.UnlinkBlocks = function (text, blocks, groups) {
 
var unlinked = false;
 
// cycle through groups
for (var group = 0; group < groups.length; group ++) {
var blockStart = groups[group].blockStart;
var blockEnd = groups[group].blockEnd;
// no block in group is at least blockMinLength words long
if (groups[group].maxWords < wDiff.blockMinLength) {
 
// unlink whole moved group if it contains no unique matched token
if ( (groups[group].fixed === false) && (groups[group].unique === false) ) {
for (var block = blockStart; block <= blockEnd; block ++) {
if (blocks[block].type == 'same') {
wDiff.UnlinkSingleBlock(blocks[block], text);
unlinked = true;
}
}
}
 
// unlink block flanks
else {
 
// unlink from start if preceded by 'del'
for (var block = blockStart; block <= blockEnd; block ++) {
if ( (block > 0) && (blocks[block - 1].type == 'del') && (blocks[block].type == 'same') ) {
// stop unlinking if more than one word or a unique word
if ( (blocks[block].words > 1) || ( (blocks[block].words == 1) && (blocks[block].unique === true) ) ) {
break;
}
wDiff.UnlinkSingleBlock(blocks[block], text);
unlinked = true;
blockStart = block;
}
}
 
// unlink from end if followed by 'del'
for (var block = blockEnd; block > blockStart; block --) {
if ( (blockEnd < blocks.length - 1) && (blocks[block + 1].type == 'del') && (blocks[block].type == 'same') ) {
// stop unlinking if more than one word or a unique word
if ( (blocks[block].words > 1) || ( (blocks[block].words == 1) && (blocks[block].unique === true) ) ) {
break;
}
wDiff.UnlinkSingleBlock(blocks[block], text);
unlinked = true;
}
}
}
}
}
return unlinked;
};
 
 
// wDiff.UnlinkBlock: un-link text tokens of single block, converting them into 'ins'/'del' pairs
// called from: wDiff.UnlinkBlocks()
// changes: text.newText/oldText[].link
 
wDiff.UnlinkSingleBlock = function (block, text) {
 
// cycle through old text
var j = block.oldStart;
for (var count = 0; count < block.count; count ++) {
 
// unlink tokens
text.newText.tokens[ text.oldText.tokens[j].link ].link = null;
text.oldText.tokens[j].link = null;
j = text.oldText.tokens[j].next;
}
return;
Line 1,874 ⟶ 1,926:
oldStart: null,
count: count,
unique: false,
words: null,
chars: null,
Line 1,948 ⟶ 2,001:
blockStart: block,
blockEnd: block,
unique: false,
maxWords: null,
words: null,
Line 2,118 ⟶ 2,172:
// cycle through groups
for (var group = 0; group < groups.length; group ++) {
var fixed = groups[group].fixed;
var color = groups[group].color;
var blockStart = groups[group].blockStart;
Line 2,358 ⟶ 2,411:
// scan for diff html tags
var regExpDiff = /<\w+\b[^>]*\bclass="[^">]*?\bwDiff(MarkLeft|MarkRight|BlockLeft|BlockRight|Delete|Insert)\b[^">]*"[^>]*>(.|\n)*?<!--wDiff\1-->/g;
var tagStarttagsStart = [];
var tagEndtagsEnd = [];
var i = 0;
var regExpMatch;
Line 2,367 ⟶ 2,420:
 
// combine consecutive diff tags
if ( (i > 0) && (tagEndtagsEnd[i - 1] == regExpMatch.index) ) {
tagEndtagsEnd[i - 1] = regExpMatch.index + regExpMatch[0].length;
}
else {
tagStarttagsStart[i] = regExpMatch.index;
tagEndtagsEnd[i] = regExpMatch.index + regExpMatch[0].length;
i ++;
}
Line 2,378 ⟶ 2,431:
 
// no diff tags detected
if (tagStarttagsStart.length === 0) {
return wDiff.htmlNoChange;
}
 
// define regexps
var regExpHeadingregExpLine = /^(\n=+|.+?=+ *\n)|(\n\{\+|.)$|\n\|\}+/g;
var regExpParagraphregExpHeading = /(^|\n)(<[^>]+>)*(==+.+?==+|\{\||\|\}).*?\n+?/g;
var regExpLineregExpParagraph = /^(\n\n+|.)|(\n\n+|.)$|\n\n+/g;
var regExpBlank = /(<[^>]+>)*\s+/g;
 
// get line positions
var regExpMatch;
var lines = [];
while ( (regExpMatch = regExpLine.exec(html)) !== null) {
lines.push(regExpMatch.index);
}
// get heading positions
var headings = [];
var headingsEnd = [];
while ( (regExpMatch = regExpHeading.exec(html)) !== null ) {
headings.push(regExpMatch.index);
headingsEnd.push(regExpMatch.index + regExpMatch[0].length);
}
 
// get paragraph positions
var paragraphs = [];
while ( (regExpMatch = regExpParagraph.exec(html)) !== null ) {
paragraphs.push(regExpMatch.index);
}
 
// determine fragment border positions around diff tags
var lineMaxBefore = 0;
var headingBefore = 0;
var paragraphBefore = 0;
var lineBefore = 0;
 
var lineMaxAfter = 0;
var headingAfter = 0;
var paragraphAfter = 0;
var lineAfter = 0;
 
var rangeStart = [];
var rangeEnd = [];
var rangeStartType = [];
var rangeEndType = [];
 
// get line break positions
var lineBreaks = [];
var pos = 0;
do {
lineBreaks.push(pos);
pos = html.indexOf('\n', pos + 1);
} while (pos != -1);
lineBreaks.push(html.length);
 
// cycle through diff tag start positions
for (var i = 0; i < tagStarttagsStart.length; i ++) {
var regExpMatchtagStart = tagsStart[i];
var tagEnd = tagsEnd[i];
 
// maximal lines to search before diff tag
var rangeStartMin = 0;
for (var j = 0lineMaxBefore; j < lineBreakslines.length - 1; j ++) {
if (tagStart[i] < lineBreakslines[j + 1]) {
if (j >=- wDiff.linesBeforeMax >= 0) {
rangeStartMin = lineBreakslines[j - wDiff.linesBeforeMax];
}
lineMaxBefore = j;
break;
}
Line 2,419 ⟶ 2,496:
 
// find last heading before diff tag
if (rangeStart[i] === undefined) {
var lastPos = tagStart[i] - wDiff.headingBefore;
for (var j = headingBefore; j < headings.length - 1; j ++) {
if (lastPos < rangeStartMin) {
if (headings[j + 1] > tagStart) {
lastPos = rangeStartMin;
if ( (headings[j] > tagStart - wDiff.headingBefore) && (headings[j] > rangeStartMin) ) {
}
rangeStart[i] = headings[j];
regExpHeading.lastIndex = lastPos;
rangeStartType[i] = 'heading';
while ( (regExpMatch = regExpHeading.exec(html)) !== null ) {
headingBefore = j;
if (regExpMatch.index > tagStart[i]) {
break; }
break;
}
}
rangeStart[i] = regExpMatch.index;
rangeStartType[i] = 'heading';
}
 
// find last paragraph before diff tag
if (rangeStart[i] === undefined) {
for (var j = paragraphBefore; j < paragraphs.length - 1; j ++) {
lastPos = tagStart[i] - wDiff.paragraphBefore;
if (paragraphs[j + 1] > tagStart - wDiff.paragraphBeforeMin) {
if (lastPos < rangeStartMin) {
if ( (paragraphs[j] > tagStart - wDiff.paragraphBeforeMax) && (paragraphs[j] > rangeStartMin) ) {
lastPos = rangeStartMin;
rangeStart[i] = paragraphs[j];
}
rangeStartType[i] = 'paragraph';
regExpParagraph.lastIndex = lastPos;
paragraphBefore = j;
while ( (regExpMatch = regExpParagraph.exec(html)) !== null) {
}
if (regExpMatch.index > tagStart[i]) {
break;
}
rangeStart[i] = regExpMatch.index;
rangeStartType[i] = 'paragraph';
}
}
Line 2,450 ⟶ 2,525:
// find last line break before diff tag
if (rangeStart[i] === undefined) {
for (var j = lineBefore; j < lines.length - 1; j ++) {
lastPos = tagStart[i] - wDiff.lineBeforeMax;
if (lastPoslines[j <+ rangeStartMin1] > tagStart - wDiff.lineBeforeMin) {
lastPos if =( (lines[j] > tagStart - wDiff.lineBeforeMax) && (lines[j] > rangeStartMin;) ) {
rangeStart[i] = lines[j];
}
regExpLine.lastIndex rangeStartType[i] = lastPos'line';
lineBefore = j;
while ( (regExpMatch = regExpLine.exec(html)) !== null ) {
}
if (regExpMatch.index > tagStart[i] - wDiff.lineBeforeMin) {
break;
}
rangeStart[i] = regExpMatch.index;
rangeStartType[i] = 'line';
}
}
Line 2,466 ⟶ 2,539:
// find last blank before diff tag
if (rangeStart[i] === undefined) {
var lastPos = tagStart[i] - wDiff.blankBeforeMax;
if (lastPos < rangeStartMin) {
lastPos = rangeStartMin;
Line 2,472 ⟶ 2,545:
regExpBlank.lastIndex = lastPos;
while ( (regExpMatch = regExpBlank.exec(html)) !== null ) {
if (regExpMatch.index > tagStart[i] - wDiff.blankBeforeMin) {
rangeStart[i] = lastPos;
rangeStartType[i] = 'blank';
break;
}
rangeStart[i]lastPos = regExpMatch.index;
rangeStartType[i] = 'blank';
}
}
Line 2,482 ⟶ 2,556:
// fixed number of chars before diff tag
if (rangeStart[i] === undefined) {
if (rangeStart[i]tagStart - wDiff.charsBefore > rangeStartMin) {
rangeStart[i] = tagStart[i] - wDiff.charsBefore;
rangeStartType[i] = 'chars';
}
Line 2,496 ⟶ 2,570:
// maximal lines to search after diff tag
var rangeEndMax = html.length;
for (var j = lineMaxAfter; j < lines.length; j ++) {
var pos = tagEnd[i];
if (lines[j] > tagEnd) {
for (var j = 0; j < wDiff.linesAfterMax; j ++) {
if (j + wDiff.linesAfterMax < lines.length) {
pos = html.indexOf('\n', pos + 1);
rangeEndMax = lines[j + wDiff.linesAfterMax];
if (pos == -1) {
}
rangeEndMax = html.length;
lineMaxAfter = j;
break;
}
rangeEndMax = pos;
}
 
// find first heading after diff tag
if (rangeEnd[i] === undefined) {
regExpHeading.lastIndex = tagEnd[i];
for (var j = headingAfter; j < headingsEnd.length; j ++) {
if ( (regExpMatch = regExpHeading.exec(html)) !== null ) {
if (headingsEnd[j] > tagEnd) {
if ( (regExpMatch.index < tagEnd[i] + wDiff.headingAfter) && (regExpMatch.index < rangeEndMax) ) {
if ( (headingsEnd[j] < tagEnd + wDiff.headingAfter) && (headingsEnd[j] < rangeEndMax) ) {
rangeEnd[i] = regExpMatch.index + regExpMatch[0].length;
rangeEndType rangeEnd[i] = 'heading'headingsEnd[j];
rangeEndType[i] = 'heading';
paragraphAfter = j;
}
break;
}
}
}
Line 2,517 ⟶ 2,596:
// find first paragraph after diff tag
if (rangeEnd[i] === undefined) {
for (var j = paragraphAfter; j < paragraphs.length; j ++) {
regExpParagraph.lastIndex = tagEnd[i];
if (paragraphs[j] > tagEnd + wDiff.paragraphAfterMin) {
if ( (regExpMatch = regExpParagraph.exec(html)) !== null ) {
if ( (regExpMatch.indexparagraphs[j] < tagEnd[i] + wDiff.paragraphAfterparagraphAfterMax) && (regExpMatch.indexparagraphs[j] < rangeEndMax) ) {
rangeEnd[i] = regExpMatch.indexparagraphs[j];
rangeEndType[i] = 'paragraph';
paragraphAfter = j;
}
break;
}
}
Line 2,528 ⟶ 2,610:
// find first line break after diff tag
if (rangeEnd[i] === undefined) {
for (var j = lineAfter; j < lines.length; j ++) {
regExpLine.lastIndex = tagEnd[i] + wDiff.lineAfterMin;
if (lines[j] > tagEnd + wDiff.lineAfterMin) {
if ( (regExpMatch = regExpLine.exec(html)) !== null ) {
if ( (regExpMatch.indexlines[j] < tagEnd[i] + wDiff.lineAfterMax) && (regExpMatch.indexlines[j] < rangeEndMax) ) {
rangeEnd[i] = regExpMatch.indexlines[j];
rangeEndType[i] = 'breakline';
lineAfter = j;
}
break;
}
}
}
 
 
// find blank after diff tag
if (rangeEnd[i] === undefined) {
regExpBlank.lastIndex = tagEnd[i] + wDiff.blankAfterMin;
if ( (regExpMatch = regExpBlank.exec(html)) !== null ) {
if ( (regExpMatch.index < tagEnd[i] + wDiff.blankAfterMax) && (regExpMatch.index < rangeEndMax) ) {
rangeEnd[i] = regExpMatch.index;
rangeEndType[i] = 'blank';
Line 2,551 ⟶ 2,635:
// fixed number of chars after diff tag
if (rangeEnd[i] === undefined) {
if (rangeEnd[i]tagEnd + wDiff.charsAfter < rangeEndMax) {
rangeEnd[i] = tagEnd[i] + wDiff.charsAfter;
rangeEndType[i] = 'chars';
}
Line 2,711 ⟶ 2,795:
 
wDiff.DebugBlocks = function (blocks) {
var dump = '\ni \toldBl \tnewBl \toldNm \tnewNm \toldSt \tcount \tuniq \twords \tchars \ttype \tsect \tgroup \tfixed \tstring\n';
for (var i = 0; i < blocks.length; i ++) {
dump += i + ' \t' + blocks[i].oldBlock + ' \t' + blocks[i].newBlock + ' \t' + blocks[i].oldNumber + ' \t' + blocks[i].newNumber + ' \t' + blocks[i].oldStart + ' \t' + blocks[i].count + ' \t' + blocks[i].unique + ' \t' + blocks[i].words + ' \t' + blocks[i].chars + ' \t' + blocks[i].type + ' \t' + blocks[i].section + ' \t' + blocks[i].group + ' \t' + blocks[i].fixed + ' \t' + wDiff.DebugShortenString(blocks[i].string) + '\n';
}
return dump;
Line 2,724 ⟶ 2,808:
 
wDiff.DebugGroups = function (groups) {
var dump = '\ni \tblSta \tblEnd \tuniq \tmaxWo \twords \tchars \tfixed \toldNm \tmFrom \tcolor \tmoved \tdiff\n';
for (var i = 0; i < groups.length; i ++) {
dump += i + ' \t' + groups[i].blockStart + ' \t' + groups[i].blockEnd + ' \t' + groups[i].unique + ' \t' + groups[i].maxWords + ' \t' + groups[i].words + ' \t' + groups[i].chars + ' \t' + groups[i].fixed + ' \t' + groups[i].oldNumber + ' \t' + groups[i].movedFrom + ' \t' + groups[i].color + ' \t' + groups[i].moved.toString() + ' \t' + wDiff.DebugShortenString(groups[i].diff) + '\n';
}
return dump;