Content deleted Content added
1.2.1a (October 14, 2014) fix CSS, fix 'one word became separated' |
1.2.2 (October 146, 2014) added new line split, sped-up word regexp and wordParse(), fixed slide gaps, more timers, freed memory of words arrays, cleaned-up code |
||
Line 3:
// ==UserScript==
// @name wikEd diff
// @version 1.2.
// @date October
// @description improved word-based diff library with block move detection
// @homepage https://en.wikipedia.org/wiki/User:Cacycle/diff
Line 468:
// Split into paragraphs, after double newlines
'paragraph': new RegExp(
'
this.config.regExpNewParagraph +
']
'g'
),
// Split into
'line': new RegExp(
'\\r\\n|\\n|\\r|[' +
this.config.regExpNewLinesAll +
']',
'g'
),
// Split into sentences /[^ ].*?[.!?:;]+(?= |$)/
'sentence': new RegExp(
'[^' +
this.config.regExpBlanks +
this.config.regExpFullStops +
this.config.regExpExclamationMarks +
Line 487 ⟶ 492:
']+(?=[' +
this.config.regExpBlanks +
']|$)',
'g'
),
Line 507 ⟶ 509:
// Split into words, multi-char markup, and chars
// regExpLetters speed-up: \\w+
'word': new RegExp(
'(\\w+|[_' +
this.config.regExpLetters +
'])+([\'
this.config.regExpLetters +
']
'g'
),
Line 544 ⟶ 547:
// RegExps for counting words
'countWords': new RegExp(
'(\\w+|[_' +
this.config.regExpLetters +
'])+([\'
this.config.regExpLetters +
']
'g'
),
Line 993 ⟶ 996:
// Split new and old text into paragraps
if ( this.config.timer === true ) {
this.time( 'paragraph split' );
}
this.newText.splitText( 'paragraph' );
this.oldText.splitText( 'paragraph' );
if ( this.config.timer === true ) {
this.timeEnd( 'paragraph split' );
}
// Calculate diff
this.calculateDiff( '
// Refine different paragraphs into
if ( this.config.timer === true ) {
this.time( 'line split' );
}
this.newText.splitRefine( 'line' );
this.oldText.splitRefine( 'line' );
if ( this.config.timer === true ) {
this.timeEnd( 'line split' );
}
// Calculate refined diff
this.calculateDiff( 'line' );
// Refine different lines into sentences
if ( this.config.timer === true ) {
this.time( 'sentence split' );
}
this.newText.splitRefine( 'sentence' );
this.oldText.splitRefine( 'sentence' );
if ( this.config.timer === true ) {
this.timeEnd( 'sentence split' );
}
// Calculate refined diff
this.calculateDiff( 'sentence' );
// Refine different
if ( this.config.timer === true ) {
this.time( 'chunk split' );
Line 1,019 ⟶ 1,047:
this.calculateDiff( 'chunk' );
// Refine different
if ( this.config.timer === true ) {
this.time( 'word split' );
Line 1,068 ⟶ 1,096:
}
//
this.symbols = undefined;
this.bordersDown = undefined;
this.bordersUp = undefined;
this.newText.words = undefined;
this.oldText.words = undefined;
// Enumerate token lists
Line 1,086 ⟶ 1,116:
}
//
this.newText.tokens = undefined;
this.oldText.tokens = undefined;
Line 1,093 ⟶ 1,123:
this.getDiffFragments();
//
this.blocks = undefined;
this.groups = undefined;
Line 1,184 ⟶ 1,214:
var i = this.newText.first;
var j = this.oldText.first;
while ( i
// Get token links
Line 1,339 ⟶ 1,369:
if ( left < shorterToken.length / 2 && (right < shorterToken.length / 2) ) {
// Do not split into chars in this gap
charSplit = false;
break;
Line 1,435 ⟶ 1,465:
*/
this.slideGaps = function ( text, textLinked ) {
var regExpSlideBorder = this.config.regExp.slideBorder;
var regExpSlideStop = this.config.regExp.slideStop;
// Cycle through tokens list
var i = text.first;
var gapStart = null;
while ( i
// Remember gap start
Line 1,475 ⟶ 1,508:
var front = text.tokens[gapFront].prev;
var back = gapBack;
var gapFrontBlankTest =
var frontStop = front;
if ( text.tokens[back].link === null ) {
Line 1,484 ⟶ 1,517:
text.tokens[front].token === text.tokens[back].token
) {
if ( front !== null ) {
// Stop at line break
if (
frontStop = front;
break;
Line 1,496 ⟶ 1,527:
// Stop at first word border (blank/word or word/blank)
if (
frontStop = front;
}
}
front = text.tokens[front].prev;
back = text.tokens[back].prev;
}
}
Line 1,545 ⟶ 1,576:
*
* @param array symbols Symbol table object
* @param string level Split level: 'paragraph', 'line', 'sentence', 'chunk', 'word',
*
* Optionally for recursive or repeated calls:
Line 1,612 ⟶ 1,643:
// Cycle through new text tokens list
var i = newStart;
while ( i
if ( this.newText.tokens[i].link === null ) {
Line 1,618 ⟶ 1,649:
var token = this.newText.tokens[i].token;
if ( Object.prototype.hasOwnProperty.call( symbols.hashTable, token ) === false ) {
symbols.
newCount: 1,
oldCount: 0,
newToken: i,
oldToken: null
} );
}
Line 1,657 ⟶ 1,687:
// Cycle through old text tokens list
var j = oldStart;
while ( j
if ( this.oldText.tokens[j].link === null ) {
Line 1,663 ⟶ 1,693:
var token = this.oldText.tokens[j].token;
if ( Object.prototype.hasOwnProperty.call( symbols.hashTable, token ) === false ) {
symbols.
newCount: 0,
oldCount: 1,
newToken: null,
oldToken: j
} );
}
Line 1,711 ⟶ 1,740:
var newToken = symbols.token[i].newToken;
var oldToken = symbols.token[i].oldToken;
var newTokenObj = this.newText.tokens[newToken];
var oldTokenObj = this.oldText.tokens[oldToken];
// Connect from new to old and from old to new
if (
// Do not use spaces as unique markers
if (
this.config.regExp.blankOnlyToken.test(
) {
// Link new
symbols.linked = true;
Line 1,736 ⟶ 1,767:
}
else {
var token =
var words =
( token.match( this.config.regExp.countWords ) || [] ).length +
Line 1,763 ⟶ 1,794:
// Set unique
if ( unique === true ) {
}
}
Line 2,065 ⟶ 2,096:
// Set longest sequence of increasing groups in sections as fixed (not moved)
this.setFixed();
// Convert groups to insertions/deletions if maximum block length is too short
Line 2,077 ⟶ 2,102:
if ( this.config.unlinkBlocks === true && this.config.blockMinLength > 0 ) {
if ( this.config.timer === true ) {
this.time( '
}
Line 2,101 ⟶ 2,126:
}
if ( this.config.timer === true ) {
this.timeEnd( '
}
}
Line 2,139 ⟶ 2,164:
*/
this.getSameBlocks = function () {
if ( this.config.timer === true ) {
this.time( 'getSameBlocks' );
}
var blocks = this.blocks;
Line 2,166 ⟶ 2,195:
var text = '';
while ( i !== null && j !== null && this.oldText.tokens[j].link === i ) {
count ++;
if ( this.newText.tokens[i].unique === true ) {
unique = true;
}
i = this.newText.tokens[i].next;
j = this.oldText.tokens[j].next;
Line 2,206 ⟶ 2,234:
for ( var block = 0; block < blocksLength; block ++ ) {
blocks[block].newBlock = block;
}
if ( this.config.timer === true ) {
this.timeEnd( 'getSameBlocks' );
}
return;
Line 2,219 ⟶ 2,251:
*/
this.getSections = function () {
if ( this.config.timer === true ) {
this.time( 'getSections' );
}
var blocks = this.blocks;
Line 2,264 ⟶ 2,300:
block = sectionEnd;
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'getSections' );
}
return;
Line 2,276 ⟶ 2,315:
*/
this.getGroups = function () {
if ( this.config.timer === true ) {
this.time( 'getGroups' );
}
var blocks = this.blocks;
Line 2,347 ⟶ 2,390:
block = groupEnd;
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'getGroups' );
}
return;
Line 2,360 ⟶ 2,406:
*/
this.setFixed = function () {
if ( this.config.timer === true ) {
this.time( 'setFixed' );
}
var blocks = this.blocks;
Line 2,399 ⟶ 2,449:
}
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'setFixed' );
}
return;
Line 2,455 ⟶ 2,508:
return returnObj;
};
/**
* Convert matching '=' blocks in groups into insertion/deletion ('+'/'-') pairs
* if too short and too common.
* Prevents fragmentated diffs for very different versions.
*
* @param[in] array blocks Blocks table object
* @param[in/out] WikEdDiffText newText, oldText Text object, linked property
* @param[in/out] array groups Groups table object
* @return bool True if text tokens were unlinked
*/
this.unlinkBlocks = function () {
var blocks = this.blocks;
var groups = this.groups;
// Cycle through groups
var unlinked = false;
var groupsLength = groups.length;
for ( var group = 0; group < groupsLength; group ++ ) {
var blockStart = groups[group].blockStart;
var blockEnd = groups[group].blockEnd;
// Unlink whole group if no block is at least blockMinLength words long and unique
if ( groups[group].maxWords < this.config.blockMinLength && groups[group].unique === false ) {
for ( var block = blockStart; block <= blockEnd; block ++ ) {
if ( blocks[block].type === '=' ) {
this.unlinkSingleBlock( blocks[block] );
unlinked = true;
}
}
}
// Otherwise unlink block flanks
else {
// Unlink blocks from start
for ( var block = blockStart; block <= blockEnd; block ++ ) {
if ( blocks[block].type === '=' ) {
// Stop unlinking if more than one word or a unique word
if ( blocks[block].words > 1 || blocks[block].unique === true ) {
break;
}
this.unlinkSingleBlock( blocks[block] );
unlinked = true;
blockStart = block;
}
}
// Unlink blocks from end
for ( var block = blockEnd; block > blockStart; block -- ) {
if ( blocks[block].type === '=' ) {
// Stop unlinking if more than one word or a unique word
if (
blocks[block].words > 1 ||
( blocks[block].words === 1 && blocks[block].unique === true )
) {
break;
}
this.unlinkSingleBlock( blocks[block] );
unlinked = true;
}
}
}
}
return unlinked;
};
/**
* Unlink text tokens of single block, convert them into into insertion/deletion ('+'/'-') pairs.
*
* @param[in] array blocks Blocks table object
* @param[out] WikEdDiffText newText, oldText Text objects, link property
*/
this.unlinkSingleBlock = function ( block ) {
// Cycle through old text
var j = block.oldStart;
for ( var count = 0; count < block.count; count ++ ) {
// Unlink tokens
this.newText.tokens[ this.oldText.tokens[j].link ].link = null;
this.oldText.tokens[j].link = null;
j = this.oldText.tokens[j].next;
}
return;
};
Line 2,465 ⟶ 2,609:
*/
this.getDelBlocks = function () {
if ( this.config.timer === true ) {
this.time( 'getDelBlocks' );
}
var blocks = this.blocks;
Line 2,512 ⟶ 2,660:
}
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'getDelBlocks' );
}
return;
Line 2,533 ⟶ 2,684:
*/
this.positionDelBlocks = function () {
if ( this.config.timer === true ) {
this.time( 'positionDelBlocks' );
}
var blocks = this.blocks;
Line 2,628 ⟶ 2,783:
this.sortBlocks();
if ( this.config.timer === true ) {
this.timeEnd( 'positionDelBlocks' );
}
return;
Line 2,729 ⟶ 2,797:
*/
this.getInsBlocks = function () {
if ( this.config.timer === true ) {
this.time( 'getInsBlocks' );
}
var blocks = this.blocks;
Line 2,776 ⟶ 2,848:
this.sortBlocks();
if ( this.config.timer === true ) {
this.timeEnd( 'getInsBlocks' );
}
return;
};
Line 2,825 ⟶ 2,900:
*/
this.setInsGroups = function () {
if ( this.config.timer === true ) {
this.time( 'setInsGroups' );
}
var blocks = this.blocks;
Line 2,865 ⟶ 2,944:
} );
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'setInsGroups' );
}
return;
Line 2,892 ⟶ 2,974:
*/
this.insertMarks = function () {
if ( this.config.timer === true ) {
this.time( 'insertMarks' );
}
var blocks = this.blocks;
Line 3,025 ⟶ 3,111:
this.sortBlocks();
if ( this.config.timer === true ) {
this.timeEnd( 'insertMarks' );
}
return;
};
Line 4,055 ⟶ 4,144:
*
* @param string label Timer label
* @param[out] array timer Current time in milliseconds (float)
*/
this.time = function ( label ) {
Line 4,071 ⟶ 4,160:
* @param string label Timer label
* @param bool noLog Do not log result
* @return float Time in milliseconds
*/
this.timeEnd = function ( label, noLog ) {
Line 4,082 ⟶ 4,171:
this.timer[label] = undefined;
if ( noLog !== true ) {
console.log( label + ': ' + diff.toFixed( 2 ) + ' ms' );
}
}
Line 4,109 ⟶ 4,198:
var timerLength = this.recursionTimer.length;
for ( var i = 0; i < timerLength; i ++ ) {
console.log( text + ' recursion ' + i + ': ' + this.recursionTimer[i].toFixed( 2 ) + ' ms
}
}
Line 4,273 ⟶ 4,362:
this.wordParse = function ( regExp ) {
var words = this.text.match( regExp );
var
var wordCounter = this.words[
if ( wordCounter === undefined ) {
wordCounter = 1;
}
else {
wordCounter ++;
}
}
}
Line 4,319 ⟶ 4,411:
var regExpMatch;
var lastIndex = 0;
while ( ( regExpMatch = regExp.exec( text ) ) !== null ) {
if ( regExpMatch.index > lastIndex ) {
split.push( text.substring( lastIndex, regExpMatch.index ) );
}
split.push( regExpMatch[0] );
lastIndex =
}
if ( lastIndex < text.length ) {
Line 4,335 ⟶ 4,428:
// Insert current item, link to previous
this.tokens
token: split[i],
prev: prev,
Line 4,342 ⟶ 4,435:
number: null,
unique: false
} );
number ++;
Line 4,396 ⟶ 4,489:
// Cycle through tokens list
var i = this.first;
while ( i
// Refine unique unmatched tokens into smaller tokens
Line 4,418 ⟶ 4,511:
var number = 0;
var i = this.first;
while ( i
this.tokens[i].number = number;
number ++;
Line 4,440 ⟶ 4,533:
dump += '\ni \tlink \t(prev \tnext) \tuniq \t#num \t"token"\n';
var i = this.first;
while ( i
dump +=
i + ' \t' + tokens[i].link + ' \t(' + tokens[i].prev + ' \t' + tokens[i].next + ') \t' +
|