User:Cacycle/diff.js: Difference between revisions

Content deleted Content added
1.2.1a (October 14, 2014) fix CSS, fix 'one word became separated'
1.2.2 (October 146, 2014) added new line split, sped-up word regexp and wordParse(), fixed slide gaps, more timers, freed memory of words arrays, cleaned-up code
Line 3:
// ==UserScript==
// @name wikEd diff
// @version 1.2.1a2
// @date October 1416, 2014
// @description improved word-based diff library with block move detection
// @homepage https://en.wikipedia.org/wiki/User:Cacycle/diff
Line 468:
// Split into paragraphs, after double newlines
'paragraph': new RegExp(
'(.|\\n)*?((\\r\\n|\\n|\\r){2,}|[' +
this.config.regExpNewParagraph +
'])+',
'g'
),
 
// Split into sentences /[^ \n][^\n]*?[.!?;]+(?=[ \n]|$)|\r\n|\n|\r/lines
'line': new RegExp(
'\\r\\n|\\n|\\r|[' +
this.config.regExpNewLinesAll +
']',
'g'
),
 
// Split into sentences /[^ ].*?[.!?:;]+(?= |$)/
'sentence': new RegExp(
'[^' +
this.config.regExpBlanks +
this'].config*?[.regExpNewLinesAll!?:;' +
'][^' +
this.config.regExpNewLinesAll +
']*?[.!?;' +
this.config.regExpFullStops +
this.config.regExpExclamationMarks +
Line 487 ⟶ 492:
']+(?=[' +
this.config.regExpBlanks +
']|$)',
this.config.regExpNewLinesAll +
']|$)|[' +
this.config.regExpNewLines +
']|\\r\\n|\\n|\\r',
'g'
),
Line 507 ⟶ 509:
 
// Split into words, multi-char markup, and chars
// regExpLetters speed-up: \\w+
'word': new RegExp(
'(\\w+|[_' +
this.config.regExpLetters +
'])+([\'’_]?[_' +
this.config.regExpLetters +
']+*)*|\\[\\[|\\]\\]|\\{\\{|\\}\\}|&\\w+;|\'\'\'|\'\'|==+|\\{\\||\\|\\}|\\|-|.',
'g'
),
Line 544 ⟶ 547:
// RegExps for counting words
'countWords': new RegExp(
'(\\w+|[_' +
this.config.regExpLetters +
'])+([\'’_]?[_' +
this.config.regExpLetters +
']+ *)*',
'g'
),
Line 993 ⟶ 996:
 
// Split new and old text into paragraps
if ( this.config.timer === true ) {
this.time( 'paragraph split' );
}
this.newText.splitText( 'paragraph' );
this.oldText.splitText( 'paragraph' );
if ( this.config.timer === true ) {
this.timeEnd( 'paragraph split' );
}
 
// Calculate diff
this.calculateDiff( 'paragraphline' );
 
// Refine different paragraphs into sentenceslines
if ( this.config.timer === true ) {
this.time( 'line split' );
}
this.newText.splitRefine( 'line' );
this.oldText.splitRefine( 'line' );
if ( this.config.timer === true ) {
this.timeEnd( 'line split' );
}
 
// Calculate refined diff
this.calculateDiff( 'line' );
 
// Refine different lines into sentences
if ( this.config.timer === true ) {
this.time( 'sentence split' );
}
this.newText.splitRefine( 'sentence' );
this.oldText.splitRefine( 'sentence' );
if ( this.config.timer === true ) {
this.timeEnd( 'sentence split' );
}
 
// Calculate refined diff
this.calculateDiff( 'sentence' );
 
// Refine different paragraphssentences into chunks
if ( this.config.timer === true ) {
this.time( 'chunk split' );
Line 1,019 ⟶ 1,047:
this.calculateDiff( 'chunk' );
 
// Refine different sentenceschunks into words
if ( this.config.timer === true ) {
this.time( 'word split' );
Line 1,068 ⟶ 1,096:
}
 
// freeFree memory
this.symbols = undefined;
this.bordersDown = undefined;
this.bordersUp = undefined;
this.newText.words = undefined;
this.oldText.words = undefined;
 
// Enumerate token lists
Line 1,086 ⟶ 1,116:
}
 
// freeFree memory
this.newText.tokens = undefined;
this.oldText.tokens = undefined;
Line 1,093 ⟶ 1,123:
this.getDiffFragments();
 
// freeFree memory
this.blocks = undefined;
this.groups = undefined;
Line 1,184 ⟶ 1,214:
var i = this.newText.first;
var j = this.oldText.first;
while ( i !== null && this.newText.tokens[i] !== null ) {
 
// Get token links
Line 1,339 ⟶ 1,369:
if ( left < shorterToken.length / 2 && (right < shorterToken.length / 2) ) {
 
// Do not split into chars in this gap
charSplit = false;
break;
Line 1,435 ⟶ 1,465:
*/
this.slideGaps = function ( text, textLinked ) {
 
var regExpSlideBorder = this.config.regExp.slideBorder;
var regExpSlideStop = this.config.regExp.slideStop;
 
// Cycle through tokens list
var i = text.first;
var gapStart = null;
while ( i !== null && text.tokens[i] !== null ) {
 
// Remember gap start
Line 1,475 ⟶ 1,508:
var front = text.tokens[gapFront].prev;
var back = gapBack;
var gapFrontBlankTest = this.config.regExp.slideBorderregExpSlideBorder.test( text.tokens[gapFront].token );
var frontStop = front;
if ( text.tokens[back].link === null ) {
Line 1,484 ⟶ 1,517:
text.tokens[front].token === text.tokens[back].token
) {
front = text.tokens[front].prev;
back = text.tokens[back].prev;
if ( front !== null ) {
 
// Stop at line break
if ( this.config.regExp.slideStopregExpSlideStop.test( text.tokens[front].token ) === true ) {
frontStop = front;
break;
Line 1,496 ⟶ 1,527:
// Stop at first word border (blank/word or word/blank)
if (
this.config.regExp.slideBorderregExpSlideBorder.test( text.tokens[front].token ) !== gapFrontBlankTest ) {
gapFrontBlankTest
) {
frontStop = front;
}
}
front = text.tokens[front].prev;
back = text.tokens[back].prev;
}
}
Line 1,545 ⟶ 1,576:
*
* @param array symbols Symbol table object
* @param string level Split level: 'paragraph', 'line', 'sentence', 'chunk', 'word', or 'character'
*
* Optionally for recursive or repeated calls:
Line 1,612 ⟶ 1,643:
// Cycle through new text tokens list
var i = newStart;
while ( i !== null && this.newText.tokens[i] !== null ) {
if ( this.newText.tokens[i].link === null ) {
 
Line 1,618 ⟶ 1,649:
var token = this.newText.tokens[i].token;
if ( Object.prototype.hasOwnProperty.call( symbols.hashTable, token ) === false ) {
var currentsymbols.hashTable[token] = symbols.token.length;
symbols.hashTable[token].push( = current;{
symbols.token[current] = {
newCount: 1,
oldCount: 0,
newToken: i,
oldToken: null
} );
}
 
Line 1,657 ⟶ 1,687:
// Cycle through old text tokens list
var j = oldStart;
while ( j !== null && this.oldText.tokens[j] !== null ) {
if ( this.oldText.tokens[j].link === null ) {
 
Line 1,663 ⟶ 1,693:
var token = this.oldText.tokens[j].token;
if ( Object.prototype.hasOwnProperty.call( symbols.hashTable, token ) === false ) {
var currentsymbols.hashTable[token] = symbols.token.length;
symbols.hashTable[token].push( = current;{
symbols.token[current] = {
newCount: 0,
oldCount: 1,
newToken: null,
oldToken: j
} );
}
 
Line 1,711 ⟶ 1,740:
var newToken = symbols.token[i].newToken;
var oldToken = symbols.token[i].oldToken;
var newTokenObj = this.newText.tokens[newToken];
var oldTokenObj = this.oldText.tokens[oldToken];
 
// Connect from new to old and from old to new
if ( this.newText.tokens[newToken]newTokenObj.link === null ) {
 
// Do not use spaces as unique markers
if (
this.config.regExp.blankOnlyToken.test( this.newText.tokens[newToken]newTokenObj.token ) === true
) {
 
// Link new anand old tokens
this.newText.tokens[newToken]newTokenObj.link = oldToken;
this.oldText.tokens[oldToken]oldTokenObj.link = newToken;
symbols.linked = true;
 
Line 1,736 ⟶ 1,767:
}
else {
var token = this.newText.tokens[newToken]newTokenObj.token;
var words =
( token.match( this.config.regExp.countWords ) || [] ).length +
Line 1,763 ⟶ 1,794:
// Set unique
if ( unique === true ) {
this.newText.tokens[newToken]newTokenObj.unique = true;
this.oldText.tokens[oldToken]oldTokenObj.unique = true;
}
}
Line 2,065 ⟶ 2,096:
 
// Set longest sequence of increasing groups in sections as fixed (not moved)
if ( this.config.timer === true ) {
this.time( 'setFixed' );
}
this.setFixed();
if ( this.config.timer === true ) {
this.time( 'setFixed' );
}
 
// Convert groups to insertions/deletions if maximum block length is too short
Line 2,077 ⟶ 2,102:
if ( this.config.unlinkBlocks === true && this.config.blockMinLength > 0 ) {
if ( this.config.timer === true ) {
this.time( 'unlinktotal unlinking' );
}
 
Line 2,101 ⟶ 2,126:
}
if ( this.config.timer === true ) {
this.timeEnd( 'unlinktotal unlinking' );
}
}
Line 2,139 ⟶ 2,164:
*/
this.getSameBlocks = function () {
 
if ( this.config.timer === true ) {
this.time( 'getSameBlocks' );
}
 
var blocks = this.blocks;
Line 2,166 ⟶ 2,195:
var text = '';
while ( i !== null && j !== null && this.oldText.tokens[j].link === i ) {
var tokentext += this.oldText.tokens[j].token;
count ++;
if ( this.newText.tokens[i].unique === true ) {
unique = true;
}
text += token;
i = this.newText.tokens[i].next;
j = this.oldText.tokens[j].next;
Line 2,206 ⟶ 2,234:
for ( var block = 0; block < blocksLength; block ++ ) {
blocks[block].newBlock = block;
}
 
if ( this.config.timer === true ) {
this.timeEnd( 'getSameBlocks' );
}
return;
Line 2,219 ⟶ 2,251:
*/
this.getSections = function () {
 
if ( this.config.timer === true ) {
this.time( 'getSections' );
}
 
var blocks = this.blocks;
Line 2,264 ⟶ 2,300:
block = sectionEnd;
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'getSections' );
}
return;
Line 2,276 ⟶ 2,315:
*/
this.getGroups = function () {
 
if ( this.config.timer === true ) {
this.time( 'getGroups' );
}
 
var blocks = this.blocks;
Line 2,347 ⟶ 2,390:
block = groupEnd;
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'getGroups' );
}
return;
Line 2,360 ⟶ 2,406:
*/
this.setFixed = function () {
 
if ( this.config.timer === true ) {
this.time( 'setFixed' );
}
 
var blocks = this.blocks;
Line 2,399 ⟶ 2,449:
}
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'setFixed' );
}
return;
Line 2,455 ⟶ 2,508:
 
return returnObj;
};
 
 
/**
* Convert matching '=' blocks in groups into insertion/deletion ('+'/'-') pairs
* if too short and too common.
* Prevents fragmentated diffs for very different versions.
*
* @param[in] array blocks Blocks table object
* @param[in/out] WikEdDiffText newText, oldText Text object, linked property
* @param[in/out] array groups Groups table object
* @return bool True if text tokens were unlinked
*/
this.unlinkBlocks = function () {
 
var blocks = this.blocks;
var groups = this.groups;
 
// Cycle through groups
var unlinked = false;
var groupsLength = groups.length;
for ( var group = 0; group < groupsLength; group ++ ) {
var blockStart = groups[group].blockStart;
var blockEnd = groups[group].blockEnd;
 
// Unlink whole group if no block is at least blockMinLength words long and unique
if ( groups[group].maxWords < this.config.blockMinLength && groups[group].unique === false ) {
for ( var block = blockStart; block <= blockEnd; block ++ ) {
if ( blocks[block].type === '=' ) {
this.unlinkSingleBlock( blocks[block] );
unlinked = true;
}
}
}
 
// Otherwise unlink block flanks
else {
 
// Unlink blocks from start
for ( var block = blockStart; block <= blockEnd; block ++ ) {
if ( blocks[block].type === '=' ) {
 
// Stop unlinking if more than one word or a unique word
if ( blocks[block].words > 1 || blocks[block].unique === true ) {
break;
}
this.unlinkSingleBlock( blocks[block] );
unlinked = true;
blockStart = block;
}
}
 
// Unlink blocks from end
for ( var block = blockEnd; block > blockStart; block -- ) {
if ( blocks[block].type === '=' ) {
 
// Stop unlinking if more than one word or a unique word
if (
blocks[block].words > 1 ||
( blocks[block].words === 1 && blocks[block].unique === true )
) {
break;
}
this.unlinkSingleBlock( blocks[block] );
unlinked = true;
}
}
}
}
return unlinked;
};
 
 
/**
* Unlink text tokens of single block, convert them into into insertion/deletion ('+'/'-') pairs.
*
* @param[in] array blocks Blocks table object
* @param[out] WikEdDiffText newText, oldText Text objects, link property
*/
this.unlinkSingleBlock = function ( block ) {
 
// Cycle through old text
var j = block.oldStart;
for ( var count = 0; count < block.count; count ++ ) {
 
// Unlink tokens
this.newText.tokens[ this.oldText.tokens[j].link ].link = null;
this.oldText.tokens[j].link = null;
j = this.oldText.tokens[j].next;
}
return;
};
 
Line 2,465 ⟶ 2,609:
*/
this.getDelBlocks = function () {
 
if ( this.config.timer === true ) {
this.time( 'getDelBlocks' );
}
 
var blocks = this.blocks;
Line 2,512 ⟶ 2,660:
}
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'getDelBlocks' );
}
return;
Line 2,533 ⟶ 2,684:
*/
this.positionDelBlocks = function () {
 
if ( this.config.timer === true ) {
this.time( 'positionDelBlocks' );
}
 
var blocks = this.blocks;
Line 2,628 ⟶ 2,783:
this.sortBlocks();
 
if ( this.config.timer === true ) {
return;
this.timeEnd( 'positionDelBlocks' );
};
 
 
/**
* Convert matching '=' blocks in groups into insertion/deletion ('+'/'-') pairs
* if too short and too common.
* Prevents fragmentated diffs for very different versions.
*
* @param[in] array blocks Blocks table object
* @param[in/out] WikEdDiffText newText, oldText Text object, linked property
* @param[in/out] array groups Groups table object
* @return bool True if text tokens were unlinked
*/
this.unlinkBlocks = function () {
 
var blocks = this.blocks;
var groups = this.groups;
 
// Cycle through groups
var unlinked = false;
var groupsLength = groups.length;
for ( var group = 0; group < groupsLength; group ++ ) {
var blockStart = groups[group].blockStart;
var blockEnd = groups[group].blockEnd;
// Unlink whole group if no block is at least blockMinLength words long and unique
if ( groups[group].maxWords < this.config.blockMinLength && groups[group].unique === false ) {
for ( var block = blockStart; block <= blockEnd; block ++ ) {
if ( blocks[block].type === '=' ) {
this.unlinkSingleBlock( blocks[block] );
unlinked = true;
}
}
}
 
// Otherwise unlink block flanks
else {
 
// Unlink blocks from start
for ( var block = blockStart; block <= blockEnd; block ++ ) {
if ( blocks[block].type === '=' ) {
 
// Stop unlinking if more than one word or a unique word
if ( blocks[block].words > 1 || blocks[block].unique === true ) {
break;
}
this.unlinkSingleBlock( blocks[block] );
unlinked = true;
blockStart = block;
}
}
 
// Unlink blocks from end
for ( var block = blockEnd; block > blockStart; block -- ) {
if ( blocks[block].type === '=' ) {
 
// Stop unlinking if more than one word or a unique word
if (
blocks[block].words > 1 ||
( blocks[block].words === 1 && blocks[block].unique === true )
) {
break;
}
this.unlinkSingleBlock( blocks[block] );
unlinked = true;
}
}
}
}
return unlinked;
};
 
 
/**
* Unlink text tokens of single block, convert them into into insertion/deletion ('+'/'-') pairs.
*
* @param[in] array blocks Blocks table object
* @param[out] WikEdDiffText newText, oldText Text objects, link property
*/
this.unlinkSingleBlock = function ( block ) {
 
// Cycle through old text
var j = block.oldStart;
for ( var count = 0; count < block.count; count ++ ) {
 
// Unlink tokens
this.newText.tokens[ this.oldText.tokens[j].link ].link = null;
this.oldText.tokens[j].link = null;
j = this.oldText.tokens[j].next;
}
return;
Line 2,729 ⟶ 2,797:
*/
this.getInsBlocks = function () {
 
if ( this.config.timer === true ) {
this.time( 'getInsBlocks' );
}
 
var blocks = this.blocks;
Line 2,776 ⟶ 2,848:
this.sortBlocks();
 
if ( this.config.timer === true ) {
this.timeEnd( 'getInsBlocks' );
}
return;
};
Line 2,825 ⟶ 2,900:
*/
this.setInsGroups = function () {
 
if ( this.config.timer === true ) {
this.time( 'setInsGroups' );
}
 
var blocks = this.blocks;
Line 2,865 ⟶ 2,944:
} );
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'setInsGroups' );
}
return;
Line 2,892 ⟶ 2,974:
*/
this.insertMarks = function () {
 
if ( this.config.timer === true ) {
this.time( 'insertMarks' );
}
 
var blocks = this.blocks;
Line 3,025 ⟶ 3,111:
this.sortBlocks();
 
if ( this.config.timer === true ) {
this.timeEnd( 'insertMarks' );
}
return;
};
Line 4,055 ⟶ 4,144:
*
* @param string label Timer label
* @param[out] array timer Current time in milliseconds (float)
*/
this.time = function ( label ) {
Line 4,071 ⟶ 4,160:
* @param string label Timer label
* @param bool noLog Do not log result
* @return float Time in milliseconds, rounded to two decimal digits
*/
this.timeEnd = function ( label, noLog ) {
Line 4,082 ⟶ 4,171:
this.timer[label] = undefined;
if ( noLog !== true ) {
console.log( label + ': ' + diff.toFixed( 2 ) + ' ms' );
}
}
Line 4,109 ⟶ 4,198:
var timerLength = this.recursionTimer.length;
for ( var i = 0; i < timerLength; i ++ ) {
console.log( text + ' recursion ' + i + ': ' + this.recursionTimer[i].toFixed( 2 ) + ' ms\n' );
}
}
Line 4,273 ⟶ 4,362:
this.wordParse = function ( regExp ) {
 
var words = this.text.match( regExp );
var regExpMatch;
whileif ( ( regExpMatch = regExp.exec( this.text ) )words !== null ) {
var wordwordsLength = regExpMatch[0]words.length;
iffor (var this.words[word]i === undefined0; i < wordsLength; i ++) {
var wordCounter = this.words[word words[i] = 1];
if ( wordCounter === undefined ) {
}
wordCounter = 1;
else {
}
this.words[word] ++;
else {
wordCounter ++;
}
}
}
Line 4,319 ⟶ 4,411:
var regExpMatch;
var lastIndex = 0;
whilevar ( ( regExpMatchregExp = this.parent.config.regExp.split[level].exec( text ) ) !== null ) {;
while ( ( regExpMatch = regExp.exec( text ) ) !== null ) {
if ( regExpMatch.index > lastIndex ) {
split.push( text.substring( lastIndex, regExpMatch.index ) );
}
split.push( regExpMatch[0] );
lastIndex = this.parent.config.regExp.split[level].lastIndex;
}
if ( lastIndex < text.length ) {
Line 4,335 ⟶ 4,428:
 
// Insert current item, link to previous
this.tokens[current] =.push( {
token: split[i],
prev: prev,
Line 4,342 ⟶ 4,435:
number: null,
unique: false
} );
number ++;
 
Line 4,396 ⟶ 4,489:
// Cycle through tokens list
var i = this.first;
while ( i !== null && this.tokens[i] !== null ) {
 
// Refine unique unmatched tokens into smaller tokens
Line 4,418 ⟶ 4,511:
var number = 0;
var i = this.first;
while ( i !== null && this.tokens[i] !== null ) {
this.tokens[i].number = number;
number ++;
Line 4,440 ⟶ 4,533:
dump += '\ni \tlink \t(prev \tnext) \tuniq \t#num \t"token"\n';
var i = this.first;
while ( i !== null && tokens[i] !== null ) {
dump +=
i + ' \t' + tokens[i].link + ' \t(' + tokens[i].prev + ' \t' + tokens[i].next + ') \t' +