User:Ohconfucius/script/Sources.js: Difference between revisions

Content deleted Content added
aligning to latest version of test script
a few more to fully align
Line 34:
txt.value=txt.value.replace(/(\|\s?(?:author(?:link\d?|)|journal|newspaper|publisher|website|work)\s*\=\s*)(?:https?:\/\/|)www\.(\w)/gi, '$1$2'); //leave only ___domain name
txt.value=txt.value.replace(/(\|\s?(?:newspaper|work|journal|publisher)\s*\=\s*)\[https?:\/\/[^\s\]]*\s([\w][^\]]*)\]/gi, '$1$2');
 
txt.value=txt.value.replace(/(\|\s?author(?:link\d?|)\s*\=\s*)\[https?:\/\/[^\s\]]*\s([\w][^\]]*)\]/gi, '$1$2');
txt.value=txt.value.replace(/(\|\s?author(?:link\d?|)\s*\=\s*)(?:https?:\/\/|)www\.[\w][^|}]*(?=[|}\n])/gi, '$1'); //rem outright (not a WL)
 
// removing references to other WP articles and 'external' WP links
regex(/<ref[^<>]*>[^<>]*\|[ ]*url ?=https?:\/\/(?:\w{2}\.wikipedia\.org\/wiki|(?:www\.|)(?:facebook|myspace|twitter)\.com)\/[^<>]*<\/ref>/gi, '{{cn}}');
Line 43:
regex(/<ref>\[https?:\/\/(?:\w{2}\.wikipedia\.org\/wiki|(?:www\.|)(?:facebook|myspace)\.com)\/[^\s\]]*[ ]+[\w\d][^\]]*\]<\/ref>/gi, '{{cn}}');
regex(/\|[ ]*url[ ]*=[ ]*https?:\/\/(?:\w{2}\.wikipedia\.org\/wiki|(?:www\.|)(?:facebook|myspace)\.com)[^\s\|\{\}<]*(?=[ ]*[|}])/gi, '');
 
regex(/[ ]\[https?:\/\/\w{2}\.wikipedia\.org\/wiki\/[^\s\]]*[ ]+([\w][^\]]*)\]/gi, ' [[$1]]');
regex(/(\|\s?url\s*\=\s*|\[)(https?:[^|{}#\s]+)#[A-Za-z0-9\.]{12,13}(?=[\s\[\]|{}<>])/gi, '$1$2'); //rem link tracking
regex(/(\|\s?url\s*\=\s*|\[)(https?:\/\/books\.google\.[^\/]+\/books\?id=\w{12}&pg=PA\d{1,3})&dq[^\s|}]+(?=\s?[|}])/gi, '$1$2'); //rem browser optimisation
 
 
}
 
function Ohc_sources_prep() {
var txt=document.editform.wpTextbox1;
 
// removing artefacts within fields
regex(/(\|\s?author\s*\=\s*)(?:by |)(?:wire staff|(?:staff |)reporters?|)[ ]*(?=[|}\n])/gi, '');
regex(/(\|\s?author\s*\=\s*)([A-Z][a-z]*(?: [A-Z][a-z]*)*) (?:wire staff|(?:staff |)reporters?)[ ]*(?=[|}\n])/gi, '$1$2');
regex(/\|[ ]*last=(Reporter|staff)[ ]*\|[ ]*first=[^|\{\}]*(?=[\|{}])=/gi, '');
regex(/(\|\s?accessdate\s*\=\s*)(?:accessed|retrieved)(?: by| on|):?[ ]*(\d)/gi, '$1$2');
regex(/(\|\s?volume\s*\=\s*)vol(?:ume|\.?)[ ]*(\d)/gi, '$1$2');
regex(/(\|\s?pages?\s*\=\s*)(?:pages?|p[gp]?\.?)[ ]*(\d)/gi, '$1$2');
 
//Remove COinS corrupting templates from CS1 citations
regex(/(\|\s?(?:authors?|first\d?|last\d?|publisher|work)\s*\=\s*(?:[^{}|]*|)){{(?:Sm|Aut|SC|Small[- ]caps|Sm?caps)\|([^{}|]*)}}(?=(?:[^{}|]*|)[|}])/gi, '$1');
Line 89:
regex(/(\|\s?title\s*\=\s*)\'&#39;([^\|\{\}]+)\'&#39;/gi, '$1$2'); //rem &#39; in titles
regex(/(\|\s?publisher\s*\=\s*)\(([^\|\{\}]+)\)/gi, '$1$2'); //rem parenthetical publishers
 
// reordering 'work' and 'publisher' (first run - see second run in cleanup function)
regex(/(\|\s?publisher\s*\=\s*(?:\[\[[^<{}\]]*\]\]|[^{}\|\}<>]*))(\s?\|[^}<>]*|)(\|\s?(?: journal|newspaper|magazine|periodical|website|work)\s*\=\s*(?:\[\[[^<{}\]]*\]\]|[^{}\|\}<>]*))(?=[\s\.]*[|}])/g, '$3$1$2');
regex(/(\|\s?website\s*\=\s*(?:\[\[[^<{}\]]*\]\]|[^{}\|\}<>]*))(\s?\|[^}<>]*|)(\|\s?(?: journal|newspaper|magazine|periodical|work)\s*\=\s*(?:\[\[[^<{}\]]*\]\]|[^{}\|\}<>]*))(?=[\s\.]*[|}])/g, '$3$1$2');
 
/// removing identical/similar entries in 'work' and 'publisher', and in 'work' and 'website' (different default vs [post] cleanup rules)
regex(/\|\s?work\s*\=\s*([^=|}\[<>]*)(\|[^}<>]*|)\|\s?(?:publisher|website)\s*\=\s*(?:\1|\[\[\1\]\])\.?(?=\s*[|}])/g, '|publisher=$1$2'); //unlinked work
regex(/\|\s?work\s*\=\s*\[\[([^<|\]]*)\]\](\|[^}<>]*|)\|\s?(?:publisher|website)\s*\=\s*(?:\1|\[\[\1\]\])\.?(?=\s*[|}])/g, '|publisher=$1$2'); //unpiped work
regex(/\|\s?work\s*\=\s*(\[\[(?:[^<|\]]*)\|([^}<>]*)\]\])(\|[^}<>]*|)\|\s?(?:publisher|website)\s*\=\s*(\1|2)\.?(?=\s*[|}])/g, '|publisher=$1$3'); //piped work
 
regex(/\|\s?publisher\s*\=\s*([^=|}\[<>]*)(\|[^}<>]*|)\|\s?publisher\s*\=\s*(?:\1|\[\[\1\]\])\.?(?=\s*[|}])/g, '|publisher=$1$2'); //unlinked work
regex(/\|\s?publisher\s*\=\s*\[\[([^<|\]]*)\]\](\|[^}<>]*|)\|\s?publisher\s*\=\s*(?:\1|\[\[\1\]\])\.?(?=\s*[|}])/g, '|publisher=$1$2'); //unpiped work
regex(/\|\s?publisher\s*\=\s*(\[\[(?:[^<|\]]*)\|([^}<>]*)\]\])(\|[^}<>]*|)\|\s?publisher\s*\=\s*(\1|2)\.?(?=\s*[|}])/g, '|publisher=$1$3'); //piped work
// reordering 'work' and 'publisher' (first run - see second run in cleanup function)
regex(/(\|\s?publisher\s*\=\s*(?:\[\[[^<{}\]]*\]\]|[^{}|<>]*))(\s?\|[^}<>]*|)(\|\s?(?: journal|newspaper|magazine|periodical|website|work)\s*\=\s*(?:\[\[[^<{}\]]*\]\]|[^{}\|\}<>]*))(?=[\s\.]*[|}])/g, '$3$1$2');
 
// remove redundant parentheses and templates from dm and md dates (equivalents also exists in Mosnum script)
Line 118 ⟶ 119:
regex(/(\|\s?title\s*\=\s*)([^\|\}<>]*)(\s?\|[^}<>]*|)\|\s?(publisher|work)\s*\=\s*(?:\1|\[\[\1\]\])(?=\s*[|}])/g, '$1ACTUAL ARTICLE TITLE BELONGS HERE! |$4=$2$3');
regex(/(\|\s?title\s*\=\s*)(\w+\.com)(?=\s?[=|{}])/gi, '$1ACTUAL ARTICLE TITLE BELONGS HERE! |publisher=$2');
 
// rem misplaced punctuation
regex(/(<ref[^>]*>[^<]+?[\]\.\},;–]\s*\'\'[\w-]*(?: [\w-]*){0,3})(\.com|)([;,\.])(\'\')(?=[^<]*?<\/ref>)/gi, '$1$2$4$3$5');
regex(/([\w]+)\.(['"]\])[ ]/gi, '$1$2. '); //LQ for titles
 
Line 127 ⟶ 128:
 
//rem underlining within certain fields
txt.value=txt.value.replace(/(\|\s?(?:journal|newspaper|periodical|publisher|work)\s*\=\s*)<u>([^|}]*)<\/u>/gi, '$1$2');
 
//rem redundant top-level domains (.com, .net, .org), strip "www"
Line 133 ⟶ 134:
 
//rem duplicated publishers in separate fields (pre)
regex(/(?:[‒–—―]+|&#124;)\s*(?:The |)([^\|\}&]{3,})(?:\.com|)\s*(\|\s?(?:agency|publisher|work)\s*\=\s*(?:\w+\.|))\1(?=\s?*[|}])/gi, '$2$1');
regex(/(?:[‒–—―]+|&#124;)\s*([^\|\}&]{3,})(?:\.com|)\s*(\|\s?(?:agency|publisher|work)\s*\=\s*)(The |)\1(?=\s*[|}])/gi, '$2$3$1');
 
//'work' and its alias (pre)
regex(/(\|[ ]*?newspaper[ ]*=[^\|}]*(?:\|[^\{\}]*|))(?:\|[ ]*?work[ ]*=[^|}]*)(?=\s?[|}])+/gi, '$1');
Line 171 ⟶ 174:
// regex(/(\[\[)(?:foo|bar)(\|)/gi, '$1foo bar \(dab\)$2');
 
regex(/(\[\[)(?:(?:British|English|London) Sun|Sun on Sunday|The Scottish Sun|(?:The |)Sun (?:\((?:British |)newspaper\)|\(tabloid\)|\(UK newspaper\)|\(UK\)|Newspaper|on Sunday|Online)|Thesun\.co\.uk)(?=\|)/gi, '$1The Sun (United Kingdom)$2');
regex(/(\[\[)Daily Star \((?:British|UK)\)(?=\|)/gi, '$1Daily Star (United Kingdom)$2');
regex(/(\[\[Metro)(?: \(Associated Metro Limited\)| \(Associated Newspapers\)| \(London newspaper\)| \(free London newspaper\)| UK| newspaper London| newspaper UK)(?=\|)/gi, '$1 (British newspaper)$2');
regex(/(\[\[)(?:Calcutta Telegraph|The Telegraph \((?:kolkatt?a|India)\)|(?:The |)Telegraph India|Telegraphindia\.com)(?=\|)/gi, '$1The Telegraph (Calcutta)$2');
regex(/(\[\[)Dawn(?:, Karachi| newspaper|\.com| \((?:Newspaper|Pakistan)\))(?=\|)/gi, '$1Dawn (newspaper)$2');
regex(/(\[\[The Pioneer)(?:, Karachi| newspaper| \((?:indian newspaper)\))(?=\|)/gi, '$1 (Indian newspaper)$2');
regex(/(\[\[)dailypioneer.com(?=\|)/gi, '$1The Pioneer (Indian newspaper)$2');
regex(/(\|)(Sport \()(newspaper\))(?=\||\]\])/g, '$1$2Spanish $3'); //dab moved December 2012
regex(/(=[ ]*\[\[)(?:[BE]SPN ?(?:USA|HD|Network|the ocho|\(United States\))|E.S.P.N.|(?:The |)Entertainment (?:and |)Sports Programming Network)(?:\|[\w, ]*)(?=\]\])/gi, '$1ESPN$2');
 
regex(/(?:agency|journal|newspaper|periodical|publisher|work)(\s?=\s?\[\[)(?:MTV (?:[A-Z]\w*|\([^\)\]]*\)))\|[^\)\]]*(?=\]\])/gi, 'publisher$1MTV$2');
 
//unwinding of unnecessary pipes
regex(/(\[\[)Public Broadcasting Service\|(PBS\]\])/gi, '[[$1$2');
 
}
Line 204 ⟶ 207:
regex(/(?:\|\s?(newspaper|work|publisher)\s*\=\s*\[\[(ACP Magazines|The Herald and Weekly Times|John Fairfax (and Sons Ltd\.?|Holdings)|Fairfax(?: Media(?: Limited|)| Digital| newspapers))\]\])(?=[\s\.]*[|}])/gi, '');
regex(/(?:\|\s?publisher\s*\=\s*(Alexander Lebedev|American Media|Associated Newspapers|Cond[eé] Nast(?: Publications|)|Daily Mail and General Trust|Devin Laz[ae]rine|Dow Jones & Company|Future plc|(Guardian|Telegraph) Media Group|(?:Guardian|Independent) News (?:and|&) Media (?:Limited|Ltd\.|)|Hachette Filipacchi Médias|Hearst (?:Corporation|Magazines(?: UK|))|Herald Media|IGN Entertainment|Imdb Inc\.?|InterMedia Partners|IDG|IPC Media|Lee Enterprises|Media ?News Group|Mortimer Zuckerman|MTV Networks|News (?:Corporation|International|Limited)|Prometheus Global Media|Reed Business Information|Rovi Corporation|Trinity Mirror|Times Newspapers|Nielsen (?: Media Research|Business Media)|Viacom|Time(?: Warner ?|)))(,? Inc| LL[CP]| Ltd|Limited|)[\s\.]*(?=[|}\n])/gi, '');
regex(/\|\s?publisher\s*\=\s*(?:The |)(?:Deseret News Publishing|Dispatch Printing|E. W. Scripps|Evening Post Publishing|Forbes(?: Publishing|, Inc\.)|Gannett?|Irish Times Trust|(?:Jann Wenner|Wenner Media)|Johnson Publishing|Journal Communications|Mac Publishing|Media24|McClatchy|Nash holdings LLC|New York Times|Seattle Times|Star Tribune|Thomp?son(?:[- ]?Reuters)?(?: Corporation| Plc.?|)|Torstar|Time Inc\.|Times (?:Group|Publishing)|Tribune|Vox Media|Washington Post|World Publishing|Ziff Davis Media)(?: Co(?:mpany|\.)?)?(?=[\s\.]*[|}])/g, '');
regex(/\|\s?publisher\s*\=\s*(?:Cox|Halifax|North Jersey|Sun-Times|Tampa|Herald|Stephens|WEHCO|\w+) Media( Group(?:, Inc.)?)?(?=[\s\.]*[|}])/g, '');
regex(/\|\s?publisher\s*\=\s*(?:\w+ )+(?:Media|Publishing|Publications)(?: Group(?:, Inc.)?)?(?=[\s\.]*[|}])/g, ''); //rem "Communications" - false positive for "Ministry of Economic Affairs and Communications" reported 28 May 2014
Line 291 ⟶ 294:
regex(/(\|)(The Sun)(?: \((?:Hong Kong|Malaysia|Nigeria|United Kingdom)\))(\]\])/g, '$1$2$3');
regex(/(=[ ]*The Telegraph) \(Calcutta\)(?=\s*[|}])/g, '$1|___location=Kolkota');
regex(/(''The Telegraph) \((Calcutta)\)('')/g, '$1$3 (Calcutta$2)');
regex(/(\|)(The Telegraph)(?: \(Calcutta\))(\]\])/g, '$1$2$3');
regex(/(=[ ]*The Daily Telegraph) \(Australia\)(?=\s*[|}])/g, '$1|___location=Australia');
regex(/(''The Daily Telegraph) \((Australia)\)('')/g, '$1$3 (Australia)');
regex(/(\|)(The Daily Telegraph)(?: \(Australia\))(\]\])/g, '$1$2$3');
regex(/(=[ ]*(?:The Daily Times)) \((Malawi)\)(\s*[|}])/g, '$1|___location=$2$3');
regex(/(''(?:The Daily Times)) \((Malawi)\)('')/g, '$1$3 ($2)');
regex(/(\|)(The Daily Times)(?: \(Malawi\))(\]\])/g, '$1$2$3');
regex(/(=[ ]*(?:The Times)) \((Malta)\)(\s*[|}])/g, '$1|___location=$2$3');
regex(/(''(?:The Times)) \((Malta)\)('')/g, '$1$3 ($2)');
Line 309 ⟶ 315:
regex(/(''(?:Il Giorno)) \((newspaper)\)('')/g, '$1$3 (Italy)');
regex(/(\|)(Il Giorno)(?: \(newspaper\))(\]\])/g, '$1$2$3');
 
regex(/(= ?(?:RT)) \((TV network)\)([ ]*[|}])/g, '$1|___location=Russia$3');
regex(/(\|)(RT)(?: \(TV network\))(\]\])/g, '$1$2$3');
Line 326 ⟶ 332:
regex(/(.) – Google [^ \]]*(\][\.,;]) Books\.google\.\w{2,3}(\.| )/gi, '$1$2Google Books$3');
regex(/(.) at Discogs(\][\.,;]) Discogs\.com(\.| )/gi, '$1$2Discogs$3');
regex(/(\|\s?author\s?\=\s?)(?:posted|publishe[dr]|written)\s?(?:by|on):?\s/gi, '|author=$1');
regex(/\|\s?(?:work|publisher)(\s?\=MTV)\|\s?publisher\s*\=\s*(?:MTV Networks|Viacom)/gi, '|publisher$1=');
 
Line 333 ⟶ 339:
 
//rem duplicated publishers in separate fields (post); rem preceding nbsp
txt.valueregex(/\s?&nbsp;\s?(\|\s?(?:agency|publisher|work)\s*\=txt.value.replace\s*)/gi, ' $1');
regex(/(?:[‒–—―]+|&#124;)\s*(?:The |)([^\|\}&]{3,})(?:\.com|)\s*(\|\s?(?:agency|publisher|work)\s*\=\s*)\1(?=\s*[|}])/gi, '$2$1');
txt.value=txt.value.replaceregex(/\s(?:[‒–—―]+|&nbsp#124;)\s*([^\|\}&]{3,})(?:\.com|)\s*(\|\s?(?:agency|publisher|work)\s*\=\s*)(The |)\1(?=\s*[|}])/gi, ' $2$3$1');
 
//per [[Help:Citation Style 1#Elements not included]]
Line 344 ⟶ 351:
regex(/(\|\s?website\s*\=\s*(?:\[\[[^<{}\]]*\]\]|[^{}\|\}<>]*))(\s?\|[^}<>]*|)(\|\s?work\s*\=\s*(?:\[\[[^<{}\]]*\]\]|[^{}\|\}<>]*))(?=[\s\.]*[|}])/g, '$3$1$2');
 
/// removing identical/similar entries in 'work' and 'publisher', and in 'work' and 'website'
regex(/\|\s?work\s*\=\s*([^\=|\}\[<>]*)(\s?\|[^}<>]*|)\|\s?(?:publisher|workwebsite)\s*\=\s*(?:\1|\[\[\1\]\])\.?(?=[\s\.]*[|}])/g, '|work=$1$2'); //unlinked work
regex(/\|\s?work\s*\=\s*(\[\[(?:[^<\|\]]*)\]\]|[^<|\]]*)(\s?\|[^}<>]*|)\|\s?(?:publisher|workwebsite)\s*\=\s*(?:\1|\[\[\1\]\])\.?(?=[\s\.]*[|}])/g, '|work=$1$2'); //unpiped work
regex(/\|\s?work\s*\=\s*(\[\[(?:[^<|\]]*)\|([^}<>]*)\]\])(\|[^}<>]*|)\|\s?(?:publisher|website)\s*\=\s*(\1|2)\.?(?=\s*[|}])/g, '|work=$1$3'); //piped work
 
// regex(/\|\s?publisher\s*\=\s*([^\=|\}\[<>]*)(\s?\|[^}<>]*|)\|\s?(?:publisher|work)\s*\=\s*(?:\1|\[\[\1\]\])\.?(?=[\s\.]*[|}])/g, '|publisher=$1$2'); //unlinked work
// regex(/\|\s?publisher\s*\=\s*(\[\[(?:[^<\|\]]*)\]\]|[^<|\]]*)(\s?\|[^}<>]*|)\|\s?(?:publisher|work)\s*\=\s*(?:\1|\[\[\1\]\])\.?(?=[\s\.]*[|}])/g, '|publisher=$1$2'); //unpiped work
regex(/\|\s?publisher\s*\=\s*(\[\[(?:[^<|\]]*)\|([^}<>]*)\]\])(\|[^}<>]*|)\|\s?publisher\s*\=\s*(\1|2)\.?(?=\s*[|}])/g, '|publisher=$1$3'); //piped work
 
regex(/\|\s?___location\s*\=\s*New York(?: City|)\s*(\|[^}<>]*|)\|\s?___location\s*\=\s*(New York(?: City|)|USA)(?=[\s\.]*[|}])/g, '|___location=New York $1');
Line 381 ⟶ 390:
 
// removing other artefacts
regex(/(UEFA\]\])\.(?:co(?:m|m?\.\w{2})|\.\w{2})(?= ?[\|{}])/gi, '$1');
 
//dynamic columns for reflists; remove scroll bar
regex(/((?:[Rr]eferences|[Nn]otes)[ ]?={2,4}[\n\r])[\r\n\s]*<div (?:style|class)=[^>]*>([\S\s]*)<\/div>/g, '$1$2');
regex(/(?:\{\{[Rr]eflist\}\}|<[Rr]eferences ?\/>)/g, '{{reflist|colwidth=30em}}');
 
}