Content deleted Content added
1.2.1a (October 14, 2014) fix CSS, fix 'one word became separated' |
another background that could use some darkmode friendly color |
||
(7 intermediate revisions by 3 users not shown) | |||
Line 3:
// ==UserScript==
// @name wikEd diff
// @version 1.2.
// @date October
// @description improved word-based diff library with block move detection
// @homepage https://en.wikipedia.org/wiki/User:Cacycle/diff
Line 36:
* - Resolution down to characters level
* - Unicode and multilingual support
* - Stepwise split (paragraphs, lines, sentences, words, characters)
* - Recursive diff
* - Optimized code for resolving unmatched sequences
Line 70:
* .newText new text
* .oldText old text
* .maxWords word count of longest linked block
* .html diff html
* .error
* .bordersDown[] linked region borders downwards, [new index, old index]
* .bordersUp[] linked region borders upwards, [new index, old index]
Line 109 ⟶ 110:
* .blockEnd last block index
* .unique contains unique linked token
* .maxWords word count of longest linked block
* .words word count
* .chars char count
Line 368 ⟶ 369:
'border-radius: 0.25em; padding: 0.2em 1px; margin: 0 1px; ' +
'} ' +
'.wikEdDiffBlock { color: #000; } ' +
'.wikEdDiffBlock0 { background-color: #ffff80; } ' +
'.wikEdDiffBlock1 { background-color: #d0ff80; } ' +
Line 407 ⟶ 408:
'.wikEdDiffContainer { } ' +
'.wikEdDiffFragment {' +
'white-space: pre-wrap; background-color: var(--background-color-base, #fff); border: #bbb solid; ' +
'border-width: 1px 1px 1px 0.5em; border-radius: 0.5em; font-family: sans-serif; ' +
'font-size: 88%; line-height: 1.6; box-shadow: 2px 2px 2px #ddd; padding: 1em; margin: 0; ' +
'} ' +
'.wikEdDiffNoChange { background: var(--background-color-interactive, #
'line-height: 1.6; box-shadow: 2px 2px 2px #ddd; padding: 0.5em; margin: 1em 0; ' +
'text-align: center; ' +
Line 468 ⟶ 469:
// Split into paragraphs, after double newlines
'paragraph': new RegExp(
'
this.config.regExpNewParagraph +
']
'g'
),
// Split into
'line': new RegExp(
'\\r\\n|\\n|\\r|[' +
this.config.regExpNewLinesAll +
']',
'g'
),
// Split into sentences /[^ ].*?[.!?:;]+(?= |$)/
'sentence': new RegExp(
'[^' +
this.config.regExpBlanks +
this.config.regExpFullStops +
this.config.regExpExclamationMarks +
Line 487 ⟶ 493:
']+(?=[' +
this.config.regExpBlanks +
']|$)',
'g'
),
Line 507 ⟶ 510:
// Split into words, multi-char markup, and chars
// regExpLetters speed-up: \\w+
'word': new RegExp(
'(\\w+|[_' +
this.config.regExpLetters +
'])+([\'
this.config.regExpLetters +
']
'g'
),
Line 544 ⟶ 548:
// RegExps for counting words
'countWords': new RegExp(
'(\\w+|[_' +
this.config.regExpLetters +
'])+([\'
this.config.regExpLetters +
']
'g'
),
Line 850 ⟶ 854:
/** @var array blocks Block data (consecutive text tokens) in new text order */
this.blocks = [];
/** @var int maxWords Maximal detected word count of all linked blocks */
this.maxWords = 0;
/** @var array groups Section blocks that are consecutive in old text order */
Line 993 ⟶ 1,000:
// Split new and old text into paragraphs
if ( this.config.timer === true ) {
this.time( 'paragraph split' );
}
this.newText.splitText( 'paragraph' );
this.oldText.splitText( 'paragraph' );
if ( this.config.timer === true ) {
this.timeEnd( 'paragraph split' );
}
// Calculate diff
this.calculateDiff( '
// Refine different paragraphs into
if ( this.config.timer === true ) {
this.time( 'line split' );
}
this.newText.splitRefine( 'line' );
this.oldText.splitRefine( 'line' );
if ( this.config.timer === true ) {
this.timeEnd( 'line split' );
}
// Calculate refined diff
this.calculateDiff( 'line' );
// Refine different lines into sentences
if ( this.config.timer === true ) {
this.time( 'sentence split' );
}
this.newText.splitRefine( 'sentence' );
this.oldText.splitRefine( 'sentence' );
if ( this.config.timer === true ) {
this.timeEnd( 'sentence split' );
}
// Calculate refined diff
this.calculateDiff( 'sentence' );
// Refine different
if ( this.config.timer === true ) {
this.time( 'chunk split' );
Line 1,019 ⟶ 1,051:
this.calculateDiff( 'chunk' );
// Refine different
if ( this.config.timer === true ) {
this.time( 'word split' );
Line 1,068 ⟶ 1,100:
}
//
this.symbols = undefined;
this.bordersDown = undefined;
this.bordersUp = undefined;
this.newText.words = undefined;
this.oldText.words = undefined;
// Enumerate token lists
Line 1,086 ⟶ 1,120:
}
//
this.newText.tokens = undefined;
this.oldText.tokens = undefined;
Line 1,093 ⟶ 1,127:
this.getDiffFragments();
//
this.blocks = undefined;
this.groups = undefined;
Line 1,184 ⟶ 1,218:
var i = this.newText.first;
var j = this.oldText.first;
while ( i
// Get token links
Line 1,339 ⟶ 1,373:
if ( left < shorterToken.length / 2 && (right < shorterToken.length / 2) ) {
// Do not split into chars in this gap
charSplit = false;
break;
Line 1,435 ⟶ 1,469:
*/
this.slideGaps = function ( text, textLinked ) {
var regExpSlideBorder = this.config.regExp.slideBorder;
var regExpSlideStop = this.config.regExp.slideStop;
// Cycle through tokens list
var i = text.first;
var gapStart = null;
while ( i
// Remember gap start
Line 1,475 ⟶ 1,512:
var front = text.tokens[gapFront].prev;
var back = gapBack;
var gapFrontBlankTest =
var frontStop = front;
if ( text.tokens[back].link === null ) {
Line 1,484 ⟶ 1,521:
text.tokens[front].token === text.tokens[back].token
) {
if ( front !== null ) {
// Stop at line break
if (
frontStop = front;
break;
Line 1,496 ⟶ 1,531:
// Stop at first word border (blank/word or word/blank)
if (
frontStop = front;
}
}
front = text.tokens[front].prev;
back = text.tokens[back].prev;
}
}
Line 1,545 ⟶ 1,580:
*
* @param array symbols Symbol table object
* @param string level Split level: 'paragraph', 'line', 'sentence', 'chunk', 'word',
*
* Optionally for recursive or repeated calls:
Line 1,580 ⟶ 1,615:
}
//
var symbols;
var bordersDown;
Line 1,590 ⟶ 1,625:
}
//
else {
symbols = {
Line 1,602 ⟶ 1,637:
//
var bordersUpNext = [];
var bordersDownNext = [];
Line 1,612 ⟶ 1,647:
// Cycle through new text tokens list
var i = newStart;
while ( i
if ( this.newText.tokens[i].link === null ) {
Line 1,618 ⟶ 1,653:
var token = this.newText.tokens[i].token;
if ( Object.prototype.hasOwnProperty.call( symbols.hashTable, token ) === false ) {
symbols.
newCount: 1,
oldCount: 0,
newToken: i,
oldToken: null
} );
}
Line 1,642 ⟶ 1,676:
}
//
if ( up === false ) {
i = this.newText.tokens[i].next;
Line 1,657 ⟶ 1,691:
// Cycle through old text tokens list
var j = oldStart;
while ( j
if ( this.oldText.tokens[j].link === null ) {
Line 1,663 ⟶ 1,697:
var token = this.oldText.tokens[j].token;
if ( Object.prototype.hasOwnProperty.call( symbols.hashTable, token ) === false ) {
symbols.
newCount: 0,
oldCount: 1,
newToken: null,
oldToken: j
} );
}
Line 1,690 ⟶ 1,723:
}
//
if ( up === false ) {
j = this.oldText.tokens[j].next;
Line 1,711 ⟶ 1,744:
var newToken = symbols.token[i].newToken;
var oldToken = symbols.token[i].oldToken;
var newTokenObj = this.newText.tokens[newToken];
var oldTokenObj = this.oldText.tokens[oldToken];
// Connect from new to old and from old to new
if (
// Do not use spaces as unique markers
if (
this.config.regExp.blankOnlyToken.test(
) {
// Link new
symbols.linked = true;
//
bordersDown.push( [newToken, oldToken] );
bordersUp.push( [newToken, oldToken] );
Line 1,736 ⟶ 1,771:
}
else {
var token =
var words =
( token.match( this.config.regExp.countWords ) || [] ).
( token.match( this.config.regExp.countChunks ) || [] )
);
// Unique if longer than min block length
if ( wordsLength >= this.config.blockMinLength ) {
unique = true;
}
Line 1,748 ⟶ 1,785:
// Unique if it contains at least one unique word
else {
for ( var
if (
this.oldText.
this.newText.
Object.prototype.hasOwnProperty.call( this.oldText.words, word ) === true &&
Object.prototype.hasOwnProperty.call( this.newText.words, word ) === true
) {
unique = true;
Line 1,763 ⟶ 1,802:
// Set unique
if ( unique === true ) {
}
}
Line 1,924 ⟶ 1,963:
}
//
if ( recursionLevel === 0 && repeating === false ) {
this.bordersDown = bordersDownNext;
Line 1,930 ⟶ 1,969:
}
//
else {
this.bordersDown = this.bordersDown.concat( bordersDownNext );
Line 2,065 ⟶ 2,104:
// Set longest sequence of increasing groups in sections as fixed (not moved)
this.setFixed();
// Convert groups to insertions/deletions if maximum block length is too short
// Only for more complex texts that actually have blocks of minimum block length
var unlinkCount = 0;
if (
this.config.unlinkBlocks === true &&
this.config.blockMinLength > 0 &&
this.maxWords >= this.config.blockMinLength
) {
if ( this.config.timer === true ) {
this.time( '
}
Line 2,094 ⟶ 2,132:
// Repeat block detection from start
this.maxWords = 0;
this.getSameBlocks();
this.getSections();
Line 2,101 ⟶ 2,140:
}
if ( this.config.timer === true ) {
this.timeEnd( '
}
}
Line 2,139 ⟶ 2,178:
*/
this.getSameBlocks = function () {
if ( this.config.timer === true ) {
this.time( 'getSameBlocks' );
}
var blocks = this.blocks;
Line 2,166 ⟶ 2,209:
var text = '';
while ( i !== null && j !== null && this.oldText.tokens[j].link === i ) {
count ++;
if ( this.newText.tokens[i].unique === true ) {
unique = true;
}
i = this.newText.tokens[i].next;
j = this.oldText.tokens[j].next;
Line 2,206 ⟶ 2,248:
for ( var block = 0; block < blocksLength; block ++ ) {
blocks[block].newBlock = block;
}
if ( this.config.timer === true ) {
this.timeEnd( 'getSameBlocks' );
}
return;
Line 2,219 ⟶ 2,265:
*/
this.getSections = function () {
if ( this.config.timer === true ) {
this.time( 'getSections' );
}
var blocks = this.blocks;
Line 2,264 ⟶ 2,314:
block = sectionEnd;
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'getSections' );
}
return;
Line 2,276 ⟶ 2,329:
*/
this.getGroups = function () {
if ( this.config.timer === true ) {
this.time( 'getGroups' );
}
var blocks = this.blocks;
Line 2,346 ⟶ 2,403:
} );
block = groupEnd;
// Set global word count of longest linked block
if ( maxWords > this.maxWords ) {
this.maxWords = maxWords;
}
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'getGroups' );
}
return;
Line 2,360 ⟶ 2,425:
*/
this.setFixed = function () {
if ( this.config.timer === true ) {
this.time( 'setFixed' );
}
var blocks = this.blocks;
Line 2,399 ⟶ 2,468:
}
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'setFixed' );
}
return;
Line 2,455 ⟶ 2,527:
return returnObj;
};
/**
 * Turn matching '=' blocks of a group back into insertion/deletion ('+'/'-') pairs
 * when they are too short and too common.
 * Prevents fragmented diffs for very different versions.
 *
 * For every group one of two things happens:
 * - the group has no block of at least config.blockMinLength words and is not unique:
 *   every '=' block of the group is unlinked;
 * - otherwise only the flanks are trimmed: '=' blocks are unlinked from the start and
 *   from the end of the group until a block stops the trimming.
 *
 * The unlinking itself is delegated to this.unlinkSingleBlock().
 *
 * @param[in] array blocks Blocks table object
 * @param[in] array groups Groups table object (.blockStart, .blockEnd, .maxWords, .unique)
 * @param[in/out] WikEdDiffText newText, oldText Text object, linked property
 * @return bool True if text tokens were unlinked
 */
this.unlinkBlocks = function () {

	var allBlocks = this.blocks;
	var allGroups = this.groups;
	var total = allGroups.length;
	var changed = false;

	for ( var g = 0; g < total; g ++ ) {
		var current = allGroups[g];
		var first = current.blockStart;
		var last = current.blockEnd;
		var b;
		var candidate;

		// Whole group is too short and not unique: unlink all of its '=' blocks
		if ( current.maxWords < this.config.blockMinLength && current.unique === false ) {
			for ( b = first; b <= last; b ++ ) {
				candidate = allBlocks[b];
				if ( candidate.type === '=' ) {
					this.unlinkSingleBlock( candidate );
					changed = true;
				}
			}
			continue;
		}

		// Trim the leading flank; a block with more than one word or a unique block stops it
		for ( b = first; b <= last; b ++ ) {
			candidate = allBlocks[b];
			if ( candidate.type !== '=' ) {
				continue;
			}
			if ( candidate.words > 1 || candidate.unique === true ) {
				break;
			}
			this.unlinkSingleBlock( candidate );
			changed = true;

			// Remember the last block unlinked from the front so the backward pass stops before it
			first = b;
		}

		// Trim the trailing flank, never reaching back to index `first`
		// NOTE(review): unlike the leading flank, a unique block only stops the trimming here
		// when it has exactly one word - confirm that this asymmetry is intended
		for ( b = last; b > first; b -- ) {
			candidate = allBlocks[b];
			if ( candidate.type !== '=' ) {
				continue;
			}
			if ( candidate.words > 1 || ( candidate.words === 1 && candidate.unique === true ) ) {
				break;
			}
			this.unlinkSingleBlock( candidate );
			changed = true;
		}
	}
	return changed;
};
/**
 * Unlink the text tokens of a single block, converting them into insertion/deletion
 * ('+'/'-') pairs.
 *
 * Starting at block.oldStart, follows the old text token chain via .next for block.count
 * tokens and clears the link on each old token and on the new token it was linked to.
 *
 * @param[in] object block Block object (.oldStart, .count)
 * @param[out] WikEdDiffText newText, oldText Text objects, link property
 */
this.unlinkSingleBlock = function ( block ) {

	var oldIndex = block.oldStart;
	var remaining = block.count;

	// Walk the old text tokens belonging to this block
	while ( remaining > 0 ) {
		var oldToken = this.oldText.tokens[oldIndex];

		// Clear the link on the new text side first, it is addressed through the old token's link
		this.newText.tokens[oldToken.link].link = null;
		oldToken.link = null;

		oldIndex = oldToken.next;
		remaining --;
	}
	return;
};
Line 2,465 ⟶ 2,628:
*/
this.getDelBlocks = function () {
if ( this.config.timer === true ) {
this.time( 'getDelBlocks' );
}
var blocks = this.blocks;
Line 2,512 ⟶ 2,679:
}
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'getDelBlocks' );
}
return;
Line 2,533 ⟶ 2,703:
*/
this.positionDelBlocks = function () {
if ( this.config.timer === true ) {
this.time( 'positionDelBlocks' );
}
var blocks = this.blocks;
Line 2,628 ⟶ 2,802:
this.sortBlocks();
if ( this.config.timer === true ) {
this.timeEnd( 'positionDelBlocks' );
}
return;
Line 2,729 ⟶ 2,816:
*/
this.getInsBlocks = function () {
if ( this.config.timer === true ) {
this.time( 'getInsBlocks' );
}
var blocks = this.blocks;
Line 2,776 ⟶ 2,867:
this.sortBlocks();
if ( this.config.timer === true ) {
this.timeEnd( 'getInsBlocks' );
}
return;
};
Line 2,825 ⟶ 2,919:
*/
this.setInsGroups = function () {
if ( this.config.timer === true ) {
this.time( 'setInsGroups' );
}
var blocks = this.blocks;
Line 2,865 ⟶ 2,963:
} );
}
}
if ( this.config.timer === true ) {
this.timeEnd( 'setInsGroups' );
}
return;
Line 2,892 ⟶ 2,993:
*/
this.insertMarks = function () {
if ( this.config.timer === true ) {
this.time( 'insertMarks' );
}
var blocks = this.blocks;
Line 3,025 ⟶ 3,130:
this.sortBlocks();
if ( this.config.timer === true ) {
this.timeEnd( 'insertMarks' );
}
return;
};
Line 3,522 ⟶ 3,630:
// Remove split element
fragments.splice( fragment, 1 );
fragmentsLength --;
// Add left text to fragments list
if ( rangeLeft !== null ) {
fragments.splice( fragment ++, 0, { text: textLeft, type: '=', color: null } );
fragmentsLength ++;
if ( omittedLeft !== null ) {
fragments.splice( fragment ++, 0, { text: '', type: omittedLeft, color: null } );
fragmentsLength ++;
}
}
Line 3,536 ⟶ 3,647:
fragments.splice( fragment ++, 0, { text: '', type: ',', color: null } );
fragments.splice( fragment ++, 0, { text: '', type: '[', color: null } );
fragmentsLength += 3;
}
Line 3,542 ⟶ 3,654:
if ( omittedRight !== null ) {
fragments.splice( fragment ++, 0, { text: '', type: omittedRight, color: null } );
fragmentsLength ++;
}
fragments.splice( fragment ++, 0, { text: textRight, type: '=', color: null } );
fragmentsLength ++;
}
}
Line 4,055 ⟶ 4,169:
*
* @param string label Timer label
* @param[out] array timer Current time in milliseconds (float)
*/
this.time = function ( label ) {
Line 4,071 ⟶ 4,185:
* @param string label Timer label
* @param bool noLog Do not log result
* @return float Time in milliseconds
*/
this.timeEnd = function ( label, noLog ) {
Line 4,082 ⟶ 4,196:
this.timer[label] = undefined;
if ( noLog !== true ) {
console.log( label + ': ' + diff.toFixed( 2 ) + ' ms' );
}
}
Line 4,109 ⟶ 4,223:
var timerLength = this.recursionTimer.length;
for ( var i = 0; i < timerLength; i ++ ) {
console.log( text + ' recursion ' + i + ': ' + this.recursionTimer[i].toFixed( 2 ) + ' ms
}
}
Line 4,251 ⟶ 4,365:
this.text = text.replace( /\r\n?/g, '\n');
//
if ( this.parent.config.timer === true ) {
this.parent.time( 'wordParse' );
Line 4,273 ⟶ 4,387:
this.wordParse = function ( regExp ) {
var regExpMatch = this.text.match( regExp );
var
if ( Object.prototype.hasOwnProperty.call( this.words, word ) === false ) {
this.words[word] = 1;
}
else {
this.words[word] ++;
}
}
}
Line 4,288 ⟶ 4,405:
/**
* Split text into paragraph, line, sentence, chunk, word, or character tokens.
*
* @param string level Level of splitting: paragraph, line, sentence, chunk, word, or character
* @param int|null token Index of token to be split, otherwise uses full text
* @param[in] string text Full text to be split
Line 4,319 ⟶ 4,436:
var regExpMatch;
var lastIndex = 0;
while ( ( regExpMatch = regExp.exec( text ) ) !== null ) {
if ( regExpMatch.index > lastIndex ) {
split.push( text.substring( lastIndex, regExpMatch.index ) );
}
split.push( regExpMatch[0] );
lastIndex =
}
if ( lastIndex < text.length ) {
Line 4,335 ⟶ 4,453:
// Insert current item, link to previous
this.tokens
token: split[i],
prev: prev,
Line 4,342 ⟶ 4,460:
number: null,
unique: false
} );
number ++;
Line 4,389 ⟶ 4,507:
* Split unique unmatched tokens into smaller tokens.
*
* @param string level Level of splitting: line, sentence, chunk, or word
* @param[in] array tokens Tokens list
*/
Line 4,396 ⟶ 4,514:
// Cycle through tokens list
var i = this.first;
while ( i
// Refine unique unmatched tokens into smaller tokens
Line 4,418 ⟶ 4,536:
var number = 0;
var i = this.first;
while ( i
this.tokens[i].number = number;
number ++;
Line 4,440 ⟶ 4,558:
dump += '\ni \tlink \t(prev \tnext) \tuniq \t#num \t"token"\n';
var i = this.first;
while ( i
dump +=
i + ' \t' + tokens[i].link + ' \t(' + tokens[i].prev + ' \t' + tokens[i].next + ') \t' +
|