Revision as of 17:54, 14 October 2014 view source Cacycle (talk \| contribs) Extended confirmed users 21,997 edits 1.2.1a (October 14, 2014) fix CSS, fix 'one word became separated' ← Previous edit		Revision as of 20:35, 16 October 2014 view source Cacycle (talk \| contribs) Extended confirmed users 21,997 edits 1.2.2 (October 146, 2014) added new line split, sped-up word regexp and wordParse(), fixed slide gaps, more timers, freed memory of words arrays, cleaned-up code Next edit →
Line 3: // ==UserScript== // @name wikEd diff // @version 1.2.1a2 // @date October 1416, 2014 // @description improved word-based diff library with block move detection // @homepage https://en.wikipedia.org/wiki/User:Cacycle/diff Line 468: // Split into paragraphs, after double newlines 'paragraph': new RegExp( '~~(.\|\\n)?(~~(\\r\\n\|\\n\|\\r){2,}\|[' + this.config.regExpNewParagraph + '])+', 'g' ), // Split into ~~sentences /[^ \n][^\n]?[.!?;]+(?=[ \n]\|$)\|\r\n\|\n\|\r/~~lines 'line': new RegExp( '\\r\\n\|\\n\|\\r\|[' + this.config.regExpNewLinesAll + ']', 'g' ), // Split into sentences /[^ ].?[.!?:;]+(?= \|$)/ 'sentence': new RegExp( '[^' + this.config.regExpBlanks + ~~this~~'].~~config~~?[.~~regExpNewLinesAll~~!?:;' + ~~'][^' +~~ ~~this.config.regExpNewLinesAll +~~ ~~']?[.!?;' +~~ this.config.regExpFullStops + this.config.regExpExclamationMarks + Line 487 ⟶ 492: ']+(?=[' + this.config.regExpBlanks + ']\|$)', ~~this.config.regExpNewLinesAll +~~ ~~']\|$)\|[' +~~ ~~this.config.regExpNewLines +~~ ~~']\|\\r\\n\|\\n\|\\r',~~ 'g' ), Line 507 ⟶ 509: // Split into words, multi-char markup, and chars // regExpLetters speed-up: \\w+ 'word': new RegExp( '(\\w+\|[_' + this.config.regExpLetters + '])+([\'’_’]?[_' + this.config.regExpLetters + ']+)\|\\[\\[\|\\]\\]\|\\{\\{\|\\}\\}\|&\\w+;\|\'\'\'\|\'\'\|==+\|\\{\\\|\|\\\|\\}\|\\\|-\|.', 'g' ), Line 544 ⟶ 547: // RegExps for counting words 'countWords': new RegExp( '(\\w+\|[_' + this.config.regExpLetters + '])+([\'’_’]?[_' + this.config.regExpLetters + ']+ )', 'g' ), Line 993 ⟶ 996: // Split new and old text into paragraps if ( this.config.timer === true ) { this.time( 'paragraph split' ); } this.newText.splitText( 'paragraph' ); this.oldText.splitText( 'paragraph' ); if ( this.config.timer === true ) { this.timeEnd( 'paragraph split' ); } // Calculate diff this.calculateDiff( '~~paragraph~~line' ); // Refine different paragraphs into ~~sentences~~lines if ( this.config.timer === true ) { this.time( 'line split' ); } this.newText.splitRefine( 'line' ); this.oldText.splitRefine( 'line' ); if ( this.config.timer === true ) { this.timeEnd( 'line split' ); } // Calculate refined diff this.calculateDiff( 'line' ); // Refine different lines into sentences if ( this.config.timer === true ) { this.time( 'sentence split' ); } this.newText.splitRefine( 'sentence' ); this.oldText.splitRefine( 'sentence' ); if ( this.config.timer === true ) { this.timeEnd( 'sentence split' ); } // Calculate refined diff this.calculateDiff( 'sentence' ); // Refine different ~~paragraphs~~sentences into chunks if ( this.config.timer === true ) { this.time( 'chunk split' ); Line 1,019 ⟶ 1,047: this.calculateDiff( 'chunk' ); // Refine different ~~sentences~~chunks into words if ( this.config.timer === true ) { this.time( 'word split' ); Line 1,068 ⟶ 1,096: } // ~~free~~Free memory this.symbols = undefined; this.bordersDown = undefined; this.bordersUp = undefined; this.newText.words = undefined; this.oldText.words = undefined; // Enumerate token lists Line 1,086 ⟶ 1,116: } // ~~free~~Free memory this.newText.tokens = undefined; this.oldText.tokens = undefined; Line 1,093 ⟶ 1,123: this.getDiffFragments(); // ~~free~~Free memory this.blocks = undefined; this.groups = undefined; Line 1,184 ⟶ 1,214: var i = this.newText.first; var j = this.oldText.first; while ( i ~~!== null && this.newText.tokens[i]~~ !== null ) { // Get token links Line 1,339 ⟶ 1,369: if ( left < shorterToken.length / 2 && (right < shorterToken.length / 2) ) { // Do not split into chars in this gap charSplit = false; break; Line 1,435 ⟶ 1,465: / this.slideGaps = function ( text, textLinked ) { var regExpSlideBorder = this.config.regExp.slideBorder; var regExpSlideStop = this.config.regExp.slideStop; // Cycle through tokens list var i = text.first; var gapStart = null; while ( i ~~!== null && text.tokens[i]~~ !== null ) { // Remember gap start Line 1,475 ⟶ 1,508: var front = text.tokens[gapFront].prev; var back = gapBack; var gapFrontBlankTest = ~~this.config.regExp.slideBorder~~regExpSlideBorder.test( text.tokens[gapFront].token ); var frontStop = front; if ( text.tokens[back].link === null ) { Line 1,484 ⟶ 1,517: text.tokens[front].token === text.tokens[back].token ) { ~~front = text.tokens[front].prev;~~ ~~back = text.tokens[back].prev;~~ if ( front !== null ) { // Stop at line break if ( ~~this.config.regExp.slideStop~~regExpSlideStop.test( text.tokens[front].token ) === true ) { frontStop = front; break; Line 1,496 ⟶ 1,527: // Stop at first word border (blank/word or word/blank) if ( ~~this.config.regExp.slideBorder~~regExpSlideBorder.test( text.tokens[front].token ) !== gapFrontBlankTest ) { ~~gapFrontBlankTest~~ ~~) {~~ frontStop = front; } } front = text.tokens[front].prev; back = text.tokens[back].prev; } } Line 1,545 ⟶ 1,576: * * @param array symbols Symbol table object * @param string level Split level: 'paragraph', 'line', 'sentence', 'chunk', 'word', or 'character' * * Optionally for recursive or repeated calls: Line 1,612 ⟶ 1,643: // Cycle through new text tokens list var i = newStart; while ( i ~~!== null && this.newText.tokens[i]~~ !== null ) { if ( this.newText.tokens[i].link === null ) { Line 1,618 ⟶ 1,649: var token = this.newText.tokens[i].token; if ( Object.prototype.hasOwnProperty.call( symbols.hashTable, token ) === false ) { ~~var current~~symbols.hashTable[token] = symbols.token.length; symbols.~~hashTable[~~token].push( ~~= current;~~{ ~~symbols.token[current] = {~~ newCount: 1, oldCount: 0, newToken: i, oldToken: null } ); } Line 1,657 ⟶ 1,687: // Cycle through old text tokens list var j = oldStart; while ( j ~~!== null && this.oldText.tokens[j]~~ !== null ) { if ( this.oldText.tokens[j].link === null ) { Line 1,663 ⟶ 1,693: var token = this.oldText.tokens[j].token; if ( Object.prototype.hasOwnProperty.call( symbols.hashTable, token ) === false ) { ~~var current~~symbols.hashTable[token] = symbols.token.length; symbols.~~hashTable[~~token].push( ~~= current;~~{ ~~symbols.token[current] = {~~ newCount: 0, oldCount: 1, newToken: null, oldToken: j } ); } Line 1,711 ⟶ 1,740: var newToken = symbols.token[i].newToken; var oldToken = symbols.token[i].oldToken; var newTokenObj = this.newText.tokens[newToken]; var oldTokenObj = this.oldText.tokens[oldToken]; // Connect from new to old and from old to new if ( ~~this.newText.tokens[newToken]~~newTokenObj.link === null ) { // Do not use spaces as unique markers if ( this.config.regExp.blankOnlyToken.test( ~~this.newText.tokens[newToken]~~newTokenObj.token ) === true ) { // Link new anand old tokens ~~this.newText.tokens[newToken]~~newTokenObj.link = oldToken; ~~this.oldText.tokens[oldToken]~~oldTokenObj.link = newToken; symbols.linked = true; Line 1,736 ⟶ 1,767: } else { var token = ~~this.newText.tokens[newToken]~~newTokenObj.token; var words = ( token.match( this.config.regExp.countWords ) \|\| [] ).length + Line 1,763 ⟶ 1,794: // Set unique if ( unique === true ) { ~~this.newText.tokens[newToken]~~newTokenObj.unique = true; ~~this.oldText.tokens[oldToken]~~oldTokenObj.unique = true; } } Line 2,065 ⟶ 2,096: // Set longest sequence of increasing groups in sections as fixed (not moved) ~~if ( this.config.timer === true ) {~~ ~~this.time( 'setFixed' );~~ } this.setFixed(); ~~if ( this.config.timer === true ) {~~ ~~this.time( 'setFixed' );~~ } // Convert groups to insertions/deletions if maximum block length is too short Line 2,077 ⟶ 2,102: if ( this.config.unlinkBlocks === true && this.config.blockMinLength > 0 ) { if ( this.config.timer === true ) { this.time( '~~unlink~~total unlinking' ); } Line 2,101 ⟶ 2,126: } if ( this.config.timer === true ) { this.timeEnd( '~~unlink~~total unlinking' ); } } Line 2,139 ⟶ 2,164: / this.getSameBlocks = function () { if ( this.config.timer === true ) { this.time( 'getSameBlocks' ); } var blocks = this.blocks; Line 2,166 ⟶ 2,195: var text = ''; while ( i !== null && j !== null && this.oldText.tokens[j].link === i ) { ~~var token~~text += this.oldText.tokens[j].token; count ++; if ( this.newText.tokens[i].unique === true ) { unique = true; } ~~text += token;~~ i = this.newText.tokens[i].next; j = this.oldText.tokens[j].next; Line 2,206 ⟶ 2,234: for ( var block = 0; block < blocksLength; block ++ ) { blocks[block].newBlock = block; } if ( this.config.timer === true ) { this.timeEnd( 'getSameBlocks' ); } return; Line 2,219 ⟶ 2,251: / this.getSections = function () { if ( this.config.timer === true ) { this.time( 'getSections' ); } var blocks = this.blocks; Line 2,264 ⟶ 2,300: block = sectionEnd; } } if ( this.config.timer === true ) { this.timeEnd( 'getSections' ); } return; Line 2,276 ⟶ 2,315: / this.getGroups = function () { if ( this.config.timer === true ) { this.time( 'getGroups' ); } var blocks = this.blocks; Line 2,347 ⟶ 2,390: block = groupEnd; } } if ( this.config.timer === true ) { this.timeEnd( 'getGroups' ); } return; Line 2,360 ⟶ 2,406: / this.setFixed = function () { if ( this.config.timer === true ) { this.time( 'setFixed' ); } var blocks = this.blocks; Line 2,399 ⟶ 2,449: } } } if ( this.config.timer === true ) { this.timeEnd( 'setFixed' ); } return; Line 2,455 ⟶ 2,508: return returnObj; }; /** * Convert matching '=' blocks in groups into insertion/deletion ('+'/'-') pairs * if too short and too common. * Prevents fragmentated diffs for very different versions. * * @param[in] array blocks Blocks table object * @param[in/out] WikEdDiffText newText, oldText Text object, linked property * @param[in/out] array groups Groups table object * @return bool True if text tokens were unlinked / this.unlinkBlocks = function () { var blocks = this.blocks; var groups = this.groups; // Cycle through groups var unlinked = false; var groupsLength = groups.length; for ( var group = 0; group < groupsLength; group ++ ) { var blockStart = groups[group].blockStart; var blockEnd = groups[group].blockEnd; // Unlink whole group if no block is at least blockMinLength words long and unique if ( groups[group].maxWords < this.config.blockMinLength && groups[group].unique === false ) { for ( var block = blockStart; block <= blockEnd; block ++ ) { if ( blocks[block].type === '=' ) { this.unlinkSingleBlock( blocks[block] ); unlinked = true; } } } // Otherwise unlink block flanks else { // Unlink blocks from start for ( var block = blockStart; block <= blockEnd; block ++ ) { if ( blocks[block].type === '=' ) { // Stop unlinking if more than one word or a unique word if ( blocks[block].words > 1 \|\| blocks[block].unique === true ) { break; } this.unlinkSingleBlock( blocks[block] ); unlinked = true; blockStart = block; } } // Unlink blocks from end for ( var block = blockEnd; block > blockStart; block -- ) { if ( blocks[block].type === '=' ) { // Stop unlinking if more than one word or a unique word if ( blocks[block].words > 1 \|\| ( blocks[block].words === 1 && blocks[block].unique === true ) ) { break; } this.unlinkSingleBlock( blocks[block] ); unlinked = true; } } } } return unlinked; }; /* * Unlink text tokens of single block, convert them into into insertion/deletion ('+'/'-') pairs. * * @param[in] array blocks Blocks table object * @param[out] WikEdDiffText newText, oldText Text objects, link property / this.unlinkSingleBlock = function ( block ) { // Cycle through old text var j = block.oldStart; for ( var count = 0; count < block.count; count ++ ) { // Unlink tokens this.newText.tokens[ this.oldText.tokens[j].link ].link = null; this.oldText.tokens[j].link = null; j = this.oldText.tokens[j].next; } return; }; Line 2,465 ⟶ 2,609: / this.getDelBlocks = function () { if ( this.config.timer === true ) { this.time( 'getDelBlocks' ); } var blocks = this.blocks; Line 2,512 ⟶ 2,660: } } } if ( this.config.timer === true ) { this.timeEnd( 'getDelBlocks' ); } return; Line 2,533 ⟶ 2,684: / this.positionDelBlocks = function () { if ( this.config.timer === true ) { this.time( 'positionDelBlocks' ); } var blocks = this.blocks; Line 2,628 ⟶ 2,783: this.sortBlocks(); if ( this.config.timer === true ) { ~~return;~~ this.timeEnd( 'positionDelBlocks' ); }; /* * Convert matching '=' blocks in groups into insertion/deletion ('+'/'-') pairs * if too short and too common. * Prevents fragmentated diffs for very different versions. * * @param[in] array blocks Blocks table object * @param[in/out] WikEdDiffText newText, oldText Text object, linked property * @param[in/out] array groups Groups table object * @return bool True if text tokens were unlinked / ~~this.unlinkBlocks = function () {~~ ~~var blocks = this.blocks;~~ ~~var groups = this.groups;~~ ~~// Cycle through groups~~ ~~var unlinked = false;~~ ~~var groupsLength = groups.length;~~ ~~for ( var group = 0; group < groupsLength; group ++ ) {~~ ~~var blockStart = groups[group].blockStart;~~ ~~var blockEnd = groups[group].blockEnd;~~ ~~// Unlink whole group if no block is at least blockMinLength words long and unique~~ ~~if ( groups[group].maxWords < this.config.blockMinLength && groups[group].unique === false ) {~~ ~~for ( var block = blockStart; block <= blockEnd; block ++ ) {~~ ~~if ( blocks[block].type === '=' ) {~~ ~~this.unlinkSingleBlock( blocks[block] );~~ ~~unlinked = true;~~ } } } ~~// Otherwise unlink block flanks~~ ~~else {~~ ~~// Unlink blocks from start~~ ~~for ( var block = blockStart; block <= blockEnd; block ++ ) {~~ ~~if ( blocks[block].type === '=' ) {~~ ~~// Stop unlinking if more than one word or a unique word~~ ~~if ( blocks[block].words > 1 \|\| blocks[block].unique === true ) {~~ ~~break;~~ } ~~this.unlinkSingleBlock( blocks[block] );~~ ~~unlinked = true;~~ ~~blockStart = block;~~ } } ~~// Unlink blocks from end~~ ~~for ( var block = blockEnd; block > blockStart; block -- ) {~~ ~~if ( blocks[block].type === '=' ) {~~ ~~// Stop unlinking if more than one word or a unique word~~ ~~if (~~ ~~blocks[block].words > 1 \|\|~~ ~~( blocks[block].words === 1 && blocks[block].unique === true )~~ ~~) {~~ ~~break;~~ } ~~this.unlinkSingleBlock( blocks[block] );~~ ~~unlinked = true;~~ } } } } ~~return unlinked;~~ }; /* * Unlink text tokens of single block, convert them into into insertion/deletion ('+'/'-') pairs. * * @param[in] array blocks Blocks table object * @param[out] WikEdDiffText newText, oldText Text objects, link property / ~~this.unlinkSingleBlock = function ( block ) {~~ ~~// Cycle through old text~~ ~~var j = block.oldStart;~~ ~~for ( var count = 0; count < block.count; count ++ ) {~~ ~~// Unlink tokens~~ ~~this.newText.tokens[ this.oldText.tokens[j].link ].link = null;~~ ~~this.oldText.tokens[j].link = null;~~ ~~j = this.oldText.tokens[j].next;~~ } return; Line 2,729 ⟶ 2,797: / this.getInsBlocks = function () { if ( this.config.timer === true ) { this.time( 'getInsBlocks' ); } var blocks = this.blocks; Line 2,776 ⟶ 2,848: this.sortBlocks(); if ( this.config.timer === true ) { this.timeEnd( 'getInsBlocks' ); } return; }; Line 2,825 ⟶ 2,900: / this.setInsGroups = function () { if ( this.config.timer === true ) { this.time( 'setInsGroups' ); } var blocks = this.blocks; Line 2,865 ⟶ 2,944: } ); } } if ( this.config.timer === true ) { this.timeEnd( 'setInsGroups' ); } return; Line 2,892 ⟶ 2,974: / this.insertMarks = function () { if ( this.config.timer === true ) { this.time( 'insertMarks' ); } var blocks = this.blocks; Line 3,025 ⟶ 3,111: this.sortBlocks(); if ( this.config.timer === true ) { this.timeEnd( 'insertMarks' ); } return; }; Line 4,055 ⟶ 4,144: * * @param string label Timer label * @param[out] array timer Current time in milliseconds (float) / this.time = function ( label ) { Line 4,071 ⟶ 4,160: @param string label Timer label * @param bool noLog Do not log result * @return float Time in milliseconds~~, rounded to two decimal digits~~ */ this.timeEnd = function ( label, noLog ) { Line 4,082 ⟶ 4,171: this.timer[label] = undefined; if ( noLog !== true ) { console.log( label + ': ' + diff.toFixed( 2 ) + ' ms' ); } } Line 4,109 ⟶ 4,198: var timerLength = this.recursionTimer.length; for ( var i = 0; i < timerLength; i ++ ) { console.log( text + ' recursion ' + i + ': ' + this.recursionTimer[i].toFixed( 2 ) + ' ms\n' ); } } Line 4,273 ⟶ 4,362: this.wordParse = function ( regExp ) { var words = this.text.match( regExp ); ~~var regExpMatch;~~ ~~while~~if ( ~~( regExpMatch = regExp.exec( this.text ) )~~words !== null ) { var ~~word~~wordsLength = ~~regExpMatch[0]~~words.length; iffor (var ~~this.words[word]~~i === ~~undefined~~0; i < wordsLength; i ++) { var wordCounter = this.words[~~word~~ words[i] ~~= 1~~]; if ( wordCounter === undefined ) { } wordCounter = 1; ~~else {~~ } ~~this.words[word] ++;~~ else { wordCounter ++; } } } Line 4,319 ⟶ 4,411: var regExpMatch; var lastIndex = 0; ~~while~~var ~~( ( regExpMatch~~regExp = this.parent.config.regExp.split[level]~~.exec( text ) ) !== null ) {~~; while ( ( regExpMatch = regExp.exec( text ) ) !== null ) { if ( regExpMatch.index > lastIndex ) { split.push( text.substring( lastIndex, regExpMatch.index ) ); } split.push( regExpMatch[0] ); lastIndex = ~~this.parent.config.~~regExp~~.split[level]~~.lastIndex; } if ( lastIndex < text.length ) { Line 4,335 ⟶ 4,428: // Insert current item, link to previous this.tokens~~[current] =~~.push( { token: split[i], prev: prev, Line 4,342 ⟶ 4,435: number: null, unique: false } ); number ++; Line 4,396 ⟶ 4,489: // Cycle through tokens list var i = this.first; while ( i ~~!== null && this.tokens[i]~~ !== null ) { // Refine unique unmatched tokens into smaller tokens Line 4,418 ⟶ 4,511: var number = 0; var i = this.first; while ( i ~~!== null && this.tokens[i]~~ !== null ) { this.tokens[i].number = number; number ++; Line 4,440 ⟶ 4,533: dump += '\ni \tlink \t(prev \tnext) \tuniq \t#num \t"token"\n'; var i = this.first; while ( i ~~!== null && tokens[i]~~ !== null ) { dump += i + ' \t' + tokens[i].link + ' \t(' + tokens[i].prev + ' \t' + tokens[i].next + ') \t' +

User:Cacycle/diff.js: Difference between revisions