Module:Delink/sandbox: Difference between revisions

Content deleted Content added
fix bug when finding the interwiki
Synced (rip one testcase but whatever), dramatically improved efficiency of one of the regex patterns
 
(23 intermediate revisions by 10 users not shown)
Line 1:
-- This module de-links most wikitext.
 
require("strict")
local getArgs = require('Module:Arguments').getArgs
local yesno = require('Module:Yesno')
 
local p = {}
-- Often-used functions and variables
local htmlDecode = mw.text.decode
local uriDecode = mw.uri.decode
local isKnownLanguageTag = mw.language.isKnownLanguageTag
local namespaces = mw.site.namespaces
 
local getArgs
p = {}
 
local function whitespaceYesnodelinkReversePipeTrick(vals)
if s:match("^%[%[|.*[|\n]") then -- Check for newlines or multiple pipes.
-- Like yesno, but trims whitespace from vals and removes blank strings.
return s
if type(val) == 'string' then
val = val:match('^%s*(.-)%s*$')
if val == '' then
return nil
end
end
return yesnos:match(val"%[%[|(.*)%]%]")
 
end
 
local function p._delinkWikilinkdelinkPipeTrick(s)
-- We need to deal with colons, brackets, and commas, per [[Help:Pipe trick]].
-- s is a string starting with '[[' and ending with ']]'. It does not contain any other ']]' strings.
-- First, remove the text before the first colon, if any.
if s:match(":") then
s = s:match("%[%[.-:(.*)|%]%]")
-- If there are no colons, grab all of the text apart from the square brackets and the pipe.
else
s = s:match("%[%[(.*)|%]%]")
end
-- Next up, brackets and commas.
local linkText = s:sub(3, -3)
if s:match("%(.-%)$") then -- Brackets trump commas.
s = s:match("(.-) ?%(.-%)$")
elseif s:match(",") then -- If there are no brackets, display only the text before the first comma.
s = s:match("[^,]*")
end
return s
end
 
-- Return wikilink target |wikilinks=target
-- Deal with nested links
local function getDelinkedTarget(s)
if linkText:find('%[%[') then
local result = s
return '[[' .. s:sub(3):gsub('%[%[.-%]%]$', p._delinkWikilink)
-- Deal with the reverse pipe trick.
if result:match("%[%[|") then
return delinkReversePipeTrick(result)
end
result = mw.uri.decode(result, "PATH") -- decode percent-encoded entities. Leave underscores and plus signs.
result = mw.text.decode(result, true) -- decode HTML entities.
-- Check for bad titles. To do this we need to find the
-- title area of the link, i.e. the part before any pipes.
local target_area
if result:match("|") then -- Find if we're dealing with a piped link.
target_area = result:match("^%[%[(.-)|.*%]%]")
else
target_area = result:match("^%[%[(.-)%]%]")
end
 
-- Check for bad characters.
local titleArea, display = linkText:match('^(.-)|(.*)$')
if mw.ustring.match(target_area, "[%[%]<>{}%%%c\n]") and mw.ustring.match(target_area, "[%[%]<>{}%%%c\n]") ~= "?" then
return s
end
return target_area
end
 
local function getDelinkedLabel(s)
-- Process links with display areas. Pipe tricks aren't processed here, as we need to know more about the link title first.
local result = s
if display then
-- Deal with the reverse pipe trick.
display = htmlDecode(display, true) -- decode HTML entities.
if titleArea == ''result:match("%[%[|") then
return delinkReversePipeTrick(result)
-- We are dealing with a reverse pipe trick.
if display:find('[|\n]') or s == '[[|]]' then
-- The link is invalid.
return s
else
return display
end
elseif display ~= '' then
-- We are dealing with a normal piped link.
return display
end
end
 
result = mw.uri.decode(result, "PATH") -- decode percent-encoded entities. Leave underscores and plus signs.
titleArea = titleArea or linkText
result = mw.text.decode(result, true) -- decode HTML entities.
 
-- Check for bad titles. To do this we need to find the
-- Decode percent-encoded and HTML-encoded characters.
-- title area of the link, i.e. the part before any pipes.
titleArea = uriDecode(titleArea, 'PATH')
local target_area
titleArea = htmlDecode(titleArea, true)
if result:match("|") then -- Find if we're dealing with a piped link.
 
target_area = result:match("^%[%[(.-)|.*%]%]")
-- Find the fragment, if any.
else
local titleAreaNoFragment, fragment = titleArea:match('^(.-)#(.*)$')
target_area = result:match("^%[%[(.-)%]%]")
titleAreaNoFragment = titleAreaNoFragment or titleArea
end
 
-- Check for bad characters.
if titleAreaNoFragment:findmw.ustring.match('target_area, "[%[%]<>{}%%%c\n]'") and mw.ustring.match(target_area, "[%[%]<>{}%%%c\n]") ~= "?" then
return s
end
 
-- Check for categories, interwikis, and files.
-- Find the interwiki and the title. Actually, only the prefix before the first
local colon_prefix = result:match("%[%[(.-):.*%]%]") or "" -- Get the text before the first colon.
-- comma is counted as the interwiki, so the "title" may contain another interwiki
local ns = mw.site.namespaces[colon_prefix] -- see if this is a known namespace
-- prefix and/or a namespace name, but it's close enough for our purposes.
if mw.language.isKnownLanguageTag(colon_prefix) or (ns and (ns.canonicalName == "File" or ns.canonicalName == "Category")) then
local interwiki, title = titleAreaNoFragment:match('^(.-):(.*)$')
return ""
title = title or titleAreaNoFragment
 
-- Check for unescaped categories, interwikis, and files. If any are found,
-- return the blank string, as nothing would be displayed.
interwiki = interwiki or ''
local ns = namespaces[interwiki]
if isKnownLanguageTag(interwiki)
or ns and (ns.id == 6 or ns.id == 14)
then
return ''
end
 
-- Remove the colon if the link is using the [[Help:Colon trick]].
titleAreaif = titleArearesult:match('^"%[%[:(.*")$') or titleAreathen
result = "[[" .. result:match("%[%[:(.*%]%])")
end
 
-- Deal with links using the [[Help:Pipe trick]].
if mw.ustring.match(result, "^%[%[[^|]*|%]%]") then
if display == '' then
return delinkPipeTrick(result)
if fragment then
end
-- Fragments in a pipe trick are invalid, so return the input string.
return s
-- Find the display area of the wikilink
end
if result:match("|") then -- Find if we're dealing with a piped link.
-- Pipe tricks don't display interwikis, so we only need the title text here.
result = result:match("^%[%[.-|(.+)%]%]")
-- We need to remove parentheses and commas. Parentheses have priority.
-- Remove new lines from the display of multiline piped links,
local pipeTrickText = title:match('^(.-) ?%(.-%)$')
-- where the pipe is before the first new line.
if pipeTrickText then
result = result:gsub("\n", "")
return pipeTrickText
else
result = result:match("^%[%[(.-)%]%]")
-- If there are no parentheses, display only the text before the first comma.
pipeTrickText = title:match('(.-),.*$') or title
return pipeTrickText
end
end
 
return result
-- If we haven't returned any text yet, display the title area.
return titleArea
end
 
local function p._delinkURLdelinkURL(s)
-- Assume we have already delinked internal wikilinks, and that
-- we have been passed some text between two square brackets [foo].
 
-- If the text contains a line break it is not formatted as a URL, regardless of other content.
if s:findmatch('"\n'") then
return s
end
 
-- Check if the text has a valid URL prefix and at least one valid URL character.
local valid_url_prefixes = {'"//'", '"http://'", '"https://'", '"ftp://'", '"gopher://'", '"mailto:'", '"news:'", '"irc://'"}
local url_prefix
for i_ ,v in ipairs(valid_url_prefixes) do
if s:findmw.ustring.match(s, '^%[' .. v ..'[^"%s].*%]' ) then
url_prefix = v
break
end
end
 
-- Get display text
if not url_prefix then
return s
-- Deal with nested links or send back original string.
return '[' .. s:sub(2):gsub('%[.-%]', p._delinkURL)
end
s = s:match('"^%['" .. url_prefix .. '"(.*)%]'") -- Grab all of the text after the URL prefix and before the final square bracket.
s = s:match('^.-(["<> [].*)') or ''"" -- Grab all of the text after the first URL separator character ("<> ).
s = s:mw.ustring.match('s, "^%s*(%S.*)$'") or ''"" -- If the separating character was a space, trim it off.
 
local s_decoded = mw.text.decode(s, true)
if s_decoded:findmw.ustring.match('s_decoded, "%c'") then
return s
else
return s_decoded
end
 
return s_decoded
 
end
 
--- Apply a de-linking function to every link of one class found in a string.
-- @param text           string to process; anything else raises an error
-- @param pattern        pattern describing one link; it must start with "^"
--                       so that it can only match at the current scan position
-- @param delinkFunction replacement passed straight to mw.ustring.gsub for
--                       each match
-- @return the processed string
-- Both errors are raised with level 2, i.e. attributed to the caller.
local function delinkLinkClass(text, pattern, delinkFunction)
	if type(text) ~= "string" then
		error("Attempt to de-link non-string input.", 2)
	end
	if type(pattern) ~= "string" or mw.ustring.sub(pattern, 1, 1) ~= "^" then
		error('Invalid pattern detected. Patterns must begin with "^".', 2)
	end
	-- Iterate over the text string and replace any matched text using the
	-- delink function. We need to iterate character by character rather
	-- than just use gsub, otherwise nested links aren't detected properly.
	-- Consumed characters are collected in a table and joined once at the
	-- end, instead of growing a string with ".." on every iteration.
	local pieces = {}
	while text ~= "" do
		-- Replace a link at the head of the remaining text, if there is one.
		-- Assigning to a single variable drops gsub's match-count result.
		text = mw.ustring.gsub(text, pattern, delinkFunction, 1)
		-- Move the left-most character from the remaining text to the output.
		pieces[#pieces + 1] = mw.ustring.sub(text, 1, 1)
		text = mw.ustring.sub(text, 2, -1)
	end
	return table.concat(pieces)
end
 
function p._delink(args)
local text = args[1] or ''""
if whitespaceYesno(args.refs) == "yes" then
-- Remove any [[Help:Strip markers]] representing ref tags. In most situations
-- this is not a good idea - only use it if you know what you are doing!
text = mw.ustring.gsub(text, '"UNIQ%w*%-ref%-%d*%-QINU'", ''"")
end
if whitespaceYesno(args.comments) ~= false"no" then
text = text:gsub('"<!%-%-.-%-%->'", ''"") -- Remove html comments.
end
 
if whitespaceYesno(args.wikilinks) ~= false then
if args.wikilinks ~= "no" and args.wikilinks ~= "target" then
text = text:gsub('%[%[.-%]%]', p._delinkWikilink) -- De-link wikilinks.
-- De-link wikilinks and return the label portion of the wikilink.
text = delinkLinkClass(text, "^%[%[.-%]%]", getDelinkedLabel)
elseif args.wikilinks == "target" then
-- De-link wikilinks and return the target portions of the wikilink.
text = delinkLinkClass(text, "^%[%[.-%]%]", getDelinkedTarget)
end
if whitespaceYesno(args.urls) ~= false"no" then
text = text:gsubdelinkLinkClass('text, "^%[.-%]'", p._delinkURLdelinkURL) -- De-link URLs.
end
if whitespaceYesno(args.whitespace) ~= false"no" then
-- Replace single new lines with a single space, but leave double new lines
-- and new lines only containing spaces or tabs before a second new line.
text = text:mw.ustring.gsub('text, "([^\n \t][ \t]*)\n([ \t]*[^\n \t])'", '"%1 %2'")
text = text:gsub('"[ \t]+'", '" '") -- Remove extra tabs and spaces.
end
return text
Line 170 ⟶ 204:
 
-- Entry point that receives a frame object, gathers arguments and hands them
-- to p._delink, returning whatever p._delink returns.
-- NOTE(review): this text comes from a rendered revision diff, and the
-- statements below look like the rows of two revisions interleaved (one
-- reading frame:getParent().args directly, one lazily loading
-- Module:Arguments). Confirm against the real module revision before relying
-- on this control flow.
function p.delink(frame)
-- As written: when getArgs has not been set yet, the parent frame's args are
-- passed to p._delink. getArgs is loaded on the way but is not used for this
-- call.
if not getArgs then
local args = frame:getParent().args
getArgs = require('Module:Arguments').getArgs
return p._delink(args)
end
-- Otherwise the arguments come from getArgs, with 'Template:Delink' given as
-- its wrappers option.
return p._delink(getArgs(frame, {wrappers = 'Template:Delink'}))
end