User:Cacycle/diff.js: Difference between revisions

Content deleted Content added
1.0.1 (August 24, 2014) +bubbling up of ambiguous gaps, make settings configurable
1.0.2 (August 24, 2014) text.new, text.old -> text.newText, text.oldText for MS IE 8 compatibility
Line 2:
 
// @name wDiff
// @version 1.0.12
// @date August 26, 2014
// @description improved word-based diff library with block move detection
Line 39:
 
text: objects for text related data
.newnewText, new text
.oldoldText: old text
.string: new or old text to be diffed
.tokens[]: token data list for new or old string (N and O)
Line 57:
.newCount: new text token counter (NC)
.oldCount: old text token counter (OC)
.newToken: token index in text.newnewText.tokens
.oldToken: token index in text.oldoldText.tokens
 
blocks[]: array of objects that holds block (consecutive text tokens) data in order of the new text
Line 277:
// prepare text data object
var text = {
newnewText: {
string: newString,
tokens: [],
Line 283:
last: null
},
oldoldText: {
string: oldString,
tokens: [],
Line 314:
 
// split new and old text into paragraps
wDiff.Split(text.newnewText, wDiff.regExpParagraph);
wDiff.Split(text.oldoldText, wDiff.regExpParagraph);
 
// calculate diff
Line 321:
 
// refine different paragraphs into sentences
wDiff.SplitRefine(text.newnewText, wDiff.regExpSentence);
wDiff.SplitRefine(text.oldoldText, wDiff.regExpSentence);
 
// calculate refined diff
Line 328:
 
// refine different sentences into words
wDiff.SplitRefine(text.newnewText, wDiff.regExpWord);
wDiff.SplitRefine(text.oldoldText, wDiff.regExpWord);
 
// calculate refined diff information with recursion for unresolved gaps
Line 335:
 
// bubble up gaps
wDiff.BubbleUpGaps(text.newnewText, text.oldoldText);
wDiff.BubbleUpGaps(text.oldoldText, text.newnewText);
 
// split tokens into chars in selected unresolved gaps
Line 347:
 
// bubble up gaps
wDiff.BubbleUpGaps(text.newnewText, text.oldoldText);
wDiff.BubbleUpGaps(text.oldoldText, text.newnewText);
 
// enumerate tokens lists
wDiff.EnumerateTokens(text.newnewText);
wDiff.EnumerateTokens(text.oldoldText);
 
// detect moved blocks
Line 367:
 
// wDiff.Split: split text into paragraph, sentence, or word tokens
// input: text (text.newnewText or text.oldoldText), object containing text data and strings; regExp, regular expression for splitting text into tokens; token, tokens index of token to be split
// changes: text (text.newnewText or text.oldoldText): text.tokens list, text.first, text.last
// called from: wDiff.Diff()
 
Line 447:
 
// wDiff.SplitRefine: split unique unmatched tokens into smaller tokens
// changes: text (text.newnewText or text.oldoldText) .tokens list
// called from: wDiff.Diff()
// calls: wDiff.Split()
Line 476:
// - same length and at least 50 % identity
// identical tokens including space separators will be linked, resulting in word-wise char-level diffs
// changes: text (text.newnewText or text.oldoldText) .tokens list
// called from: wDiff.Diff()
// calls: wDiff.Split()
Line 493:
var gaps = [];
var gap = null;
var i = text.newnewText.first;
var j = text.oldoldText.first;
while ( (i != null) && (text.newnewText.tokens[i] != null) ) {
 
// get list item properties
var newLink = text.newnewText.tokens[i].link;
var oldLink = null;
if (j != null) {
oldLink = text.oldoldText.tokens[j].link;
}
 
Line 531:
// next list elements
if (newLink != null) {
j = text.oldoldText.tokens[newLink].next;
}
i = text.newnewText.tokens[i].next;
}
 
Line 541:
// cycle trough old text tokens list
var j = gaps[gap].oldFirst;
while ( (j != null) && (text.oldoldText.tokens[j] != null) && (text.oldoldText.tokens[j].link == null) ) {
 
// count old chars and tokens in gap
Line 547:
gaps[gap].oldTokens ++;
 
j = text.oldoldText.tokens[j].next;
}
}
Line 563:
// one word became separated by space, dash, or any string
if ( (gaps[gap].newTokens == 1) && (gaps[gap].oldTokens == 3) ) {
if (text.newnewText.tokens[ gaps[gap].newFirst ].token != text.oldoldText.tokens[ gaps[gap].oldFirst ].token + text.oldoldText.tokens[ gaps[gap].oldLast ].token ) {
continue;
}
}
else if ( (gaps[gap].oldTokens == 1) && (gaps[gap].newTokens == 3) ) {
if (text.oldoldText.tokens[ gaps[gap].oldFirst ].token != text.newnewText.tokens[ gaps[gap].newFirst ].token + text.newnewText.tokens[ gaps[gap].newLast ].token ) {
continue;
}
Line 581:
var j = gaps[gap].oldFirst;
while (i != null) {
var newToken = text.newnewText.tokens[i].token;
var oldToken = text.oldoldText.tokens[j].token;
 
// get shorter and longer token
Line 658:
break;
}
i = text.newnewText.tokens[i].next;
j = text.oldoldText.tokens[j].next;
}
gaps[gap].charSplit = charSplit;
Line 675:
var j = gaps[gap].oldFirst;
while (i != null) {
var newToken = text.newnewText.tokens[i].token;
var oldToken = text.oldoldText.tokens[j].token;
 
// link identical tokens (spaces)
if (newToken == oldToken) {
text.newnewText.tokens[i].link = j;
text.oldoldText.tokens[j].link = i;
}
 
// refine different words into chars
else {
wDiff.Split(text.newnewText, wDiff.regExpChar, i);
wDiff.Split(text.oldoldText, wDiff.regExpChar, j);
}
 
Line 694:
break;
}
i = text.newnewText.tokens[i].next;
j = text.oldoldText.tokens[j].next;
}
}
Line 708:
// wDiff.BubbleUpGaps: move gaps with ambiguous identical fronts and backs up
// start ambiguous gap borders after line breaks and text section closing characters
// changes: text (text.newnewText or text.oldoldText) .tokens list
// called from: wDiff.Diff()
 
Line 766:
 
// wDiff.EnumerateTokens: enumerate text token list
// changes: text (text.newnewText or text.oldoldText) .tokens list
// called from: wDiff.Diff()
 
Line 786:
// input: text, object containing text data and tokens
// optionally for recursive calls: newStart, newEnd, oldStart, oldEnd (tokens list indexes), recursionLevel
// changes: text.oldoldText/newnewText.tokens[].link, links corresponding tokens from old and new text
// steps:
// pass 1: parse new text into symbol table
Line 803:
 
// set defaults
if (typeof newStart == 'undefined') { newStart = text.newnewText.first; }
if (typeof newEnd == 'undefined') { newEnd = text.newnewText.last; }
if (typeof oldStart == 'undefined') { oldStart = text.oldoldText.first; }
if (typeof oldEnd == 'undefined') { oldEnd = text.oldoldText.last; }
if (typeof recursionLevel == 'undefined') { recursionLevel = 0; }
 
Line 820:
// cycle trough new text tokens list
var i = newStart;
while ( (i != null) && (text.newnewText.tokens[i] != null) ) {
 
// parse token only once during split refinement
if ( (text.newnewText.tokens[i].parsed == false) || (recursionLevel > 0) ) {
text.newnewText.tokens[i].parsed = true;
 
// add new entry to symbol table
var token = text.newnewText.tokens[i].token;
if (Object.prototype.hasOwnProperty.call(symbols, token) == false) {
var current = symbol.length;
Line 852:
break;
}
i = text.newnewText.tokens[i].next;
}
 
Line 861:
// cycle trough old text tokens list
var j = oldStart;
while ( (j != null) && (text.oldoldText.tokens[j] != null) ) {
 
// parse token only once during split refinement
if ( (text.oldoldText.tokens[j].parsed == false) || (recursionLevel > 0) ) {
text.oldoldText.tokens[j].parsed = true;
 
// add new entry to symbol table
var token = text.oldoldText.tokens[j].token;
if (Object.prototype.hasOwnProperty.call(symbols, token) == false) {
var current = symbol.length;
Line 896:
break;
}
j = text.oldoldText.tokens[j].next;
}
 
Line 912:
 
// do not use spaces as unique markers
if (/^\s+$/.test(text.newnewText.tokens[newToken].token) == false) {
 
// connect from new to old and from old to new
if (text.newnewText.tokens[newToken].link == null) {
text.newnewText.tokens[newToken].link = oldToken;
text.oldoldText.tokens[oldToken].link = newToken;
}
}
Line 928:
 
// cycle trough new text tokens list
var i = text.newnewText.first;
while ( (i != null) && (text.newnewText.tokens[i] != null) ) {
var iNext = text.newnewText.tokens[i].next;
 
// find already connected pairs
var j = text.newnewText.tokens[i].link;
if (j != null) {
var jNext = text.oldoldText.tokens[j].next;
 
// check if the following tokens are not yet connected
if ( (iNext != null) && (jNext != null) ) {
if ( (text.newnewText.tokens[iNext].link == null) && (text.oldoldText.tokens[jNext].link == null) ) {
 
// connect if the following tokens are the same
if (text.newnewText.tokens[iNext].token == text.oldoldText.tokens[jNext].token) {
text.newnewText.tokens[iNext].link = jNext;
text.oldoldText.tokens[jNext].link = iNext;
}
}
Line 957:
 
// cycle trough new text tokens list
var i = text.newnewText.last;
while ( (i != null) && (text.newnewText.tokens[i] != null) ) {
var iNext = text.newnewText.tokens[i].prev;
 
// find already connected pairs
var j = text.newnewText.tokens[i].link;
if (j != null) {
var jNext = text.oldoldText.tokens[j].prev;
 
// check if the preceeding tokens are not yet connected
if ( (iNext != null) && (jNext != null) ) {
if ( (text.newnewText.tokens[iNext].link == null) && (text.oldoldText.tokens[jNext].link == null) ) {
 
// connect if the preceeding tokens are the same
if (text.newnewText.tokens[iNext].token == text.oldoldText.tokens[jNext].token) {
text.newnewText.tokens[iNext].link = jNext;
text.oldoldText.tokens[jNext].link = iNext;
}
}
Line 992:
var j = oldStart;
 
while ( (i != null) && (text.newnewText.tokens[i] != null) ) {
 
// get j from previous tokens match
var iPrev = text.newnewText.tokens[i].prev;
if (iPrev != null) {
var jPrev = text.newnewText.tokens[iPrev].link;
if (jPrev != null) {
j = text.oldoldText.tokens[jPrev].next;
}
}
 
// check for the start of an unresolved sequence
if ( (j != null) && (text.oldoldText.tokens[j] != null) && (text.newnewText.tokens[i].link == null) && (text.oldoldText.tokens[j].link == null) ) {
 
// determine the limits of of the unresolved new sequence
Line 1,011:
var iLength = 0;
var iNext = i;
while ( (iNext != null) && (text.newnewText.tokens[iNext].link == null) ) {
iEnd = iNext;
iLength ++;
Line 1,017:
break;
}
iNext = text.newnewText.tokens[iNext].next;
}
 
Line 1,025:
var jLength = 0;
var jNext = j;
while ( (jNext != null) && (text.oldoldText.tokens[jNext].link == null) ) {
jEnd = jNext;
jLength ++;
Line 1,031:
break;
}
jNext = text.oldoldText.tokens[jNext].next;
}
 
Line 1,049:
break;
}
i = text.newnewText.tokens[i].next;
}
 
Line 1,059:
var i = newEnd;
var j = oldEnd;
while ( (i != null) && (text.newnewText.tokens[i] != null) ) {
 
// get j from next matched tokens
var iPrev = text.newnewText.tokens[i].next;
if (iPrev != null) {
var jPrev = text.newnewText.tokens[iPrev].link;
if (jPrev != null) {
j = text.oldoldText.tokens[jPrev].prev;
}
}
 
// check for the start of an unresolved sequence
if ( (j != null) && (text.oldoldText.tokens[j] != null) && (text.newnewText.tokens[i].link == null) && (text.oldoldText.tokens[j].link == null) ) {
 
// determine the limits of of the unresolved new sequence
Line 1,078:
var iLength = 0;
var iNext = i;
while ( (iNext != null) && (text.newnewText.tokens[iNext].link == null) ) {
iStart = iNext;
iLength ++;
Line 1,084:
break;
}
iNext = text.newnewText.tokens[iNext].prev;
}
 
Line 1,092:
var jLength = 0;
var jNext = j;
while ( (jNext != null) && (text.oldoldText.tokens[jNext].link == null) ) {
jStart = jNext;
jLength ++;
Line 1,098:
break;
}
jNext = text.oldoldText.tokens[jNext].prev;
}
 
Line 1,116:
break;
}
i = text.newnewText.tokens[i].prev;
}
}
Line 1,157:
wDiff.DetectBlocks = function(text, blocks, groups) {
 
// WED('text.oldoldText', wDiff.DebugText(text.oldoldText));
// WED('text.newnewText', wDiff.DebugText(text.newnewText));
 
//
Line 1,165:
 
// cycle through old text to find matched (linked) blocks
var j = text.oldoldText.first;
var i = null;
var deletions = [];
Line 1,174:
var delEnd = null;
var string = '';
while ( (j != null) && (text.oldoldText.tokens[j].link == null) ) {
string += text.oldoldText.tokens[j].token;
delEnd = j;
j = text.oldoldText.tokens[j].next;
}
 
Line 1,191:
// get 'same' block
if (j != null) {
i = text.oldoldText.tokens[j].link;
var iStart = i;
var jStart = j;
Line 1,198:
var chars = 0;
var string = '';
while ( (i != null) && (j != null) && (text.oldoldText.tokens[j].link == i) ) {
var token = text.oldoldText.tokens[j].token;
chars += token.length;
string += token;
i = text.newnewText.tokens[i].next;
j = text.oldoldText.tokens[j].next;
}
 
Line 1,209:
blocks.push({
oldBlock: blocks.length,
oldNumber: text.oldoldText.tokens[jStart].number,
newNumber: text.newnewText.tokens[iStart].number,
chars: chars,
type: 'same',
Line 1,392:
 
// cycle through new text to find insertion blocks
var i = text.newnewText.first;
while (i != null) {
 
// jump over linked (matched) block
while ( (i != null) && (text.newnewText.tokens[i].link != null) ) {
i = text.newnewText.tokens[i].next;
}
 
Line 1,404:
var iStart = i;
var string = '';
while ( (i != null) && (text.newnewText.tokens[i].link == null) ) {
string += text.newnewText.tokens[i].token;
i = text.newnewText.tokens[i].next;
}
 
Line 1,413:
oldBlock: null,
oldNumber: null,
newNumber: text.newnewText.tokens[iStart].number,
chars: null,
type: 'ins',
Line 1,504:
blocks.push({
oldBlock: null,
oldNumber: text.oldoldText.tokens[ deletions[del].oldStart ].number,
newNumber: newNumber,
chars: null,
Line 2,256:
 
//
// wDiff.DebugText: dump text (text.oldoldText or text.newnewText) object
//