Module:Sandbox/Hellknowz/Test: Difference between revisions

Content deleted Content added
preprocess after all
back
 
(41 intermediate revisions by the same user not shown)
Line 128:
if (month2 and not year2) then return PARSERESULT_INCOMPLETE end -- same but other end
-- While technically there are more cases, none should have been matched and been given to us
local date1, time1, date2, time2
 
Line 181:
-- Element type "constants", kept mostly for readability.
-- Each one is a numeric tag for a kind of token found in a date/time string; the
-- pattern-matching code compares entries of its `elements` table against these tags.
-- (They appear to be what seekNextElement() returns as its first result.)
-- The trailing comment on each line shows sample text for that element type.
local ELEMENT_INVALID = 1
local ELEMENT_ONETWODIGITS = 2 -- '1' '12' '01'
local ELEMENT_FOURDIGITS = 3 -- '1234'
local ELEMENT_WHITESPACE = 4 -- ' ' ' '
local ELEMENT_MONTHWORD = 5 -- 'May' 'February' 'Aug'
local ELEMENT_COMMA = 6 -- ',' ', '
local ELEMENT_DASH = 7 -- '-' ' - ' ' — ' '- ' ' -'
local ELEMENT_DATESEPARATOR = 8 -- '-'
local ELEMENT_TIMESEPARATOR = 9 -- ':'
local ELEMENT_TIMEPERIOD = 10 -- 'am' 'p.m.'
local ELEMENT_PERIODWHITESPACE = 11 -- '.' or '. '
local ELEMENT_ONETWODIGITSWITHORDINAL = 12 -- '12th' '3rd'
 
function seekNextElement()
Line 195 ⟶ 197:
-- Profiler says mw.ustring.find is the bottleneck, probably because it's unicode; not sure how to improve though besides writing my own pattern matcher
 
-- Digits with letters
local foundPositionStart, foundPositionEnd, foundMatch, foundMatch2 = mw.ustring.find(seekString, '^([0-9]+)([a-z]+)%.?', currentPosition)
if (foundPositionStart) then
--currentPosition = foundPositionEnd + 1 -- this is our new start location -- only if we return
-- Additionally check how many digits we actually have, as arbitrary number isn't valid
if (#foundMatch <= 2) then -- most likely a day number
if (foundMatch2 == 'st' or foundMatch2 == 'nd' or foundMatch2 == 'rd' or foundMatch2 == 'th') then -- won't bother checking against a number, no false positives that I saw in 120k cases
currentPosition = foundPositionEnd + 1 -- this is our new start ___location (forced to do this here, since we don't always return)
return ELEMENT_ONETWODIGITSWITHORDINAL, tonumber(foundMatch), (currentPosition > mw.ustring.len(seekString))
--else -- let it capture digits again, this time '10am' '8p.m.' will be separate
-- return ELEMENT_INVALID -- not a valid ordinal indicator
end
--else -- let it capture digits again, this time '10am' '8p.m.' will be separate
-- return ELEMENT_INVALID -- just the invalid, the number of digits (3+) won't match any patterns
end
end
-- Digits
local foundPositionStart, foundPositionEnd, foundMatch = mw.ustring.find(seekString, '^([0-9]+)', currentPosition)
Line 250 ⟶ 270:
currentPosition = foundPositionEnd + 1 -- this is our new start ___location
return ELEMENT_COMMA, foundMatch, (currentPosition > mw.ustring.len(seekString))
end
-- Period and any following whitespace ('Feb. 2010' or '29. June')
foundPositionStart, foundPositionEnd, foundMatch = mw.ustring.find(seekString, '^(%.%s*)', currentPosition)
if (foundPositionStart) then
currentPosition = foundPositionEnd + 1 -- this is our new start ___location
return ELEMENT_PERIODWHITESPACE, foundMatch, (currentPosition > mw.ustring.len(seekString))
end
Line 352 ⟶ 379:
-- Only immediate big improvement is to only seekNextElement() when actually checking that deep, though this will make a (even bigger) mess
 
if (elements[1] == ELEMENT_ONETWODIGITS or elements[1] == ELEMENT_ONETWODIGITSWITHORDINAL) then -- '3' or '10' or '12th'
if (elements[2] == ELEMENT_WHITESPACE or elements[2] == ELEMENT_PERIODWHITESPACE) then -- '3 ' or '3. '
if (elements[3] == ELEMENT_MONTHWORD) then -- '3 May'
if (numberOfElements == 3) then return checkAndOutput(nil, values[3], values[1], nil, nil, nil) end
if (elements[4] == ELEMENT_WHITESPACE or elements[4] == ELEMENT_PERIODWHITESPACE or elements[4] == ELEMENT_COMMA) then -- '3 May ' or '3 Feb. ' or '3 May, '
if (elements[5] == ELEMENT_FOURDIGITS) then -- '3 May 2013'
if (numberOfElements == 5) then return checkAndOutput(values[5], values[3], values[1], nil, nil, nil) end
Line 380 ⟶ 407:
end
elseif (elements[6] == ELEMENT_DASH or elements[6] == ELEMENT_DATESEPARATOR) then -- '3 May 2013 - '
if (elements[7] == ELEMENT_ONETWODIGITS or elements[7] == ELEMENT_ONETWODIGITSWITHORDINAL) then -- '3 May 2013 - 12' or '3rd May 2013 - 12th'
if (elements[8] == ELEMENT_WHITESPACE) then -- '3 May 2013 - 12 '
if (elements[9] == ELEMENT_MONTHWORD) then -- '3 May 2013 - 12 February'
Line 394 ⟶ 421:
end
elseif (elements[4] == ELEMENT_DASH or elements[4] == ELEMENT_DATESEPARATOR) then -- '3 May - '
if (elements[5] == ELEMENT_ONETWODIGITS or elements[5] == ELEMENT_ONETWODIGITSWITHORDINAL) then -- '3 May - 12' or '3rd May - 12th'
if (elements[6] == ELEMENT_WHITESPACE) then -- '3 May - 12 '
if (elements[7] == ELEMENT_MONTHWORD) then -- '3 May - 12 October'
Line 409 ⟶ 436:
end
elseif (elements[2] == ELEMENT_DASH or elements[2] == ELEMENT_DATESEPARATOR) then -- '3 - '
if (elements[3] == ELEMENT_ONETWODIGITS or elements[3] == ELEMENT_ONETWODIGITSWITHORDINAL) then -- '3 - 12' or '3rd - 12th'
if (elements[4] == ELEMENT_WHITESPACE) then -- '3 - 12 '
if (elements[5] == ELEMENT_MONTHWORD) then -- '3 - 12 May'
Line 432 ⟶ 459:
local i = 0 -- this is our offset from the closest possible ___location for date seeking
 
if (elements[2] == ELEMENT_TIMESEPARATOR and elements[1] ~= ELEMENT_ONETWODIGITSWITHORDINAL) then -- '10:' but not '10th:'
possibleHour = values[1] -- only once we see ':' (or 'am' below) it is likely a time
if (elements[3] == ELEMENT_ONETWODIGITS) then -- '10:28'
Line 508 ⟶ 535:
if (elements[2] == ELEMENT_DATESEPARATOR) then -- '2013-'
if (elements[3] == ELEMENT_ONETWODIGITS) then -- '2013-05'
--if (numberOfElements == 3) then return checkAndOutput(values[1], values[3], nil, nil, nil, nil) end
-- This is actually ambiguous -- 2008-12 can be years 2008 to 2012 or it could be December 2008; few cases, so just ignoring
if (elements[4] == ELEMENT_DATESEPARATOR) then -- '2013-05-'
if (elements[5] == ELEMENT_ONETWODIGITS) then -- '2013-05-03'
Line 541 ⟶ 569:
if (numberOfElements == 3) then return checkAndOutput(values[1], nil, nil, nil, nil, nil, values[3], nil, nil, nil, nil, nil) end
end
elseif (elements[2] == ELEMENT_WHITESPACE or elements[2] == ELEMENT_COMMA) then -- '2013 ' or '2013, '
if (elements[3] == ELEMENT_MONTHWORD) then -- '2013 May'
if (numberOfElements == 3) then return checkAndOutput(values[1], values[3], nil, nil, nil, nil) end
-- 2013 May - 2013 April (let's see first if this is ever used real-world)
if (elements[4] == ELEMENT_WHITESPACE) then -- '2013 May '
if (elements[5] == ELEMENT_ONETWODIGITS) then -- '2013 May 15'
if (numberOfElements == 5) then return checkAndOutput(values[1], values[3], values[5], nil, nil, nil) end
end
end
elseif (elements[3] == ELEMENT_ONETWODIGITS) then -- '2013 15' or '2013, 15'
if (elements[4] == ELEMENT_WHITESPACE) then -- '2013 15 '
if (elements[5] == ELEMENT_MONTHWORD) then -- '2013 15 May'
if (numberOfElements == 5) then return checkAndOutput(values[1], values[5], values[3], nil, nil, nil) end
end
end
end
end
Line 550 ⟶ 589:
elseif (elements[1] == ELEMENT_MONTHWORD) then -- 'May'
if (numberOfElements == 1) then return checkAndOutput(nil, values[1], nil, nil, nil, nil) end
if (elements[2] == ELEMENT_WHITESPACE or elements[2] == ELEMENT_PERIODWHITESPACE) then -- 'May ' or 'Feb. '
if (elements[3] == ELEMENT_ONETWODIGITS or elements[3] == ELEMENT_ONETWODIGITSWITHORDINAL) then -- 'May 3' or 'May 3rd'
if (numberOfElements == 3) then return checkAndOutput(nil, values[1], values[3], nil, nil, nil) end
if (elements[4] == ELEMENT_COMMA or elements[4] == ELEMENT_WHITESPACE) then -- 'May 3, '
Line 579 ⟶ 618:
if (elements[7] == ELEMENT_MONTHWORD) then -- 'May 3, 2013 - February'
if (elements[8] == ELEMENT_WHITESPACE) then -- 'May 3, 2013 - February '
if (elements[9] == ELEMENT_ONETWODIGITS or elements[3] == ELEMENT_ONETWODIGITSWITHORDINAL) then -- 'May 3, 2013 - February 12' or 'May 3rd, 2013 - February 12th'
if (elements[10] == ELEMENT_COMMA or elements[10] == ELEMENT_WHITESPACE) then -- 'May 3, 2013 - February 12, '
if (elements[11] == ELEMENT_FOURDIGITS) then -- 'May 3, 2013 - February 12, 2014'
Line 593 ⟶ 632:
if (elements[5] == ELEMENT_MONTHWORD) then -- 'May 3 - June'
if (elements[6] == ELEMENT_WHITESPACE) then -- 'May 3 - June '
if (elements[7] == ELEMENT_ONETWODIGITS or elements[3] == ELEMENT_ONETWODIGITSWITHORDINAL) then -- 'May 3 - June 12' or 'May 3rd - June 12th'
if (numberOfElements == 7) then return checkAndOutput(nil, values[1], values[3], nil, nil, nil, nil, values[5], values[7], nil, nil, nil) end
if (elements[8] == ELEMENT_COMMA or elements[8] == ELEMENT_WHITESPACE) then -- 'May 3 - June 12, '
Line 602 ⟶ 641:
end
end
elseif (elements[5] == ELEMENT_ONETWODIGITS or elements[3] == ELEMENT_ONETWODIGITSWITHORDINAL) then -- 'May 3 - 12' or 'May 3rd - 12th'
if (numberOfElements == 5) then return checkAndOutput(nil, values[1], values[3], nil, nil, nil, nil, values[1], values[5], nil, nil, nil) end
if (elements[6] == ELEMENT_COMMA or elements[6] == ELEMENT_WHITESPACE) then -- 'May 3 - 12, '
Line 630 ⟶ 669:
end
end
end
elseif (elements[2] == ELEMENT_COMMA) then -- 'May, '
if (elements[3] == ELEMENT_FOURDIGITS) then -- 'May, 2012'
if (numberOfElements == 3) then return checkAndOutput(values[3], values[1], nil, nil, nil, nil) end
end
end
Line 636 ⟶ 679:
return PARSERESULT_UNRECOGNIZED -- the combination of elements was not a recognized one
end
 
end
 
-- Lua patterns for the opening of a call to a metadata-emitting date template.
-- This is a basic list of template names; in reality there are more templates and more redirects.
local METADATA_TEMPLATE_PATTERNS = {
	'%{%{[Ss]tart[ %-]?date',
	'%{%{[Ee]nd[ %-]?date',
	'%{%{[Bb]irth[ %-]?date',
	'%{%{[Dd]eath[ %-]?date',
	'%{%{[Bb]irth[ %-]?year',
	'%{%{[Dd]eath[ %-]?year',
	'%{%{[Ff]ilm ?date',
	'%{%{[Rr]elease ?date',
	-- Previously written as '[ISO[ %-]date', which opened a character class by accident:
	-- it could never match '{{ISO date' and instead matched things like '{{Odate'.
	'%{%{[Ii]SO[ %-]date',
}

--- Report whether the input contains a call to one of the listed metadata templates.
-- Only the opening of the call is checked: '{{' immediately followed by the template name.
-- @param input wikitext string to inspect
-- @return true if any listed template opening occurs anywhere in input, false otherwise
function hasMetadataTemplates(input)
	for i = 1, #METADATA_TEMPLATE_PATTERNS do
		if (string.match(input, METADATA_TEMPLATE_PATTERNS[i])) then return true end
	end

	return false
end
Line 660 ⟶ 720:
 
if (result == PARSERESULT_FAIL) then if (frame.args[2] == 'pretty') then return frame:preprocess('\'\'{{gray|Failed parse}}\'\'') else return 'Failed parse' end end
if (result == PARSERESULT_UNRECOGNIZED) then if (frame.args[2] == 'pretty') then return frame:preprocess('\'\'{{gray|Unrecognized pattern}}\'\'') else return 'Unrecognized pattern' end end
local s
if (hasMetadataTemplates(input)) then s = 'Has metadata template' else s = 'Unrecognized pattern' end
if (frame.args[2] == 'pretty') then return frame:preprocess('\'\'{{gray|'..s..'}}\'\'') else return s end
end
if (result == PARSERESULT_INVALID) then if (frame.args[2] == 'pretty') then return frame:preprocess('\'\'{{maroon|Invalid date/time}}\'\'') else return 'Invalid date/time' end end
if (result == PARSERESULT_INVALIDRANGE) then if (frame.args[2] == 'pretty') then return frame:preprocess('\'\'{{maroon|Invalid date range}}\'\'') else return 'Invalid date range' end end
Line 679 ⟶ 743:
-- Ordered list of Lua patterns, each capturing the bare value out of one known kind of
-- wrapper. Order matters: longer variants (3 refs, 2 flag icons) must be tried before the
-- shorter ones, otherwise the shorter pattern would leave part of the wrapper in the capture.
local STRIP_PATTERNS = {
	'^([^<]-)<ref[^>]*>[^<]*</ref><ref[^>]*>[^<]*</ref><ref[^>]*>[^<]*</ref>$', -- three basic refs after the value
	'^([^<]-)<ref[^>]*>[^<]*</ref><ref[^>]*>[^<]*</ref>$', -- two basic refs
	'^([^<]-)<ref[^>]*>[^<]*</ref>$', -- one basic ref (quite common)
	'^([^<]-)<ref[^>]*/><ref[^>]*/><ref[^>]*/>$', -- three self-closed named refs
	'^([^<]-)<ref[^>]*/><ref[^>]*/>$', -- two self-closed named refs
	'^([^<]-)<ref[^>]*/>$', -- one self-closed named ref (quite common)
	-- '-' is a quantifier in Lua patterns, so the dashes of '<!--' and '-->' must be escaped
	'^<!%-%-.-%-%->(.+)$', -- comment before (sometimes used for notes to editors [not yet seen metadata-related comment])
	'^([^<]-)<!%-%-.-%-%->$', -- comment after (sometimes used for notes to editors)
	'^{{[Ff]lag ?icon[^}]+}}%s*{{[Ff]lag ?icon[^}]+}}(.+)$', -- 2 flag icons (also more common than one would think)
	'^{{[Ff]lag ?icon[^}]+}}(.+)$', -- flag icon (quite common, although against MOS:ICON)
	'^([^<]-){{[Ff]lag ?icon[^}]+}}%s*{{[Ff]lag ?icon[^}]+}}$', -- 2 flag icons after the value
	'^([^<]-){{[Ff]lag ?icon[^}]+}}$', -- flag icon after the value
	'^{{[Nn]o ?wrap *%|([^}%|]+)}}$', -- no-wrapped value
	'^{{[Nn]o ?break *%|([^}%|]+)}}$', -- no-wrapped redirect
	'^{{[Nn]obr *%|([^}%|]+)}}$', -- no-wrapped redirect
	'^{{[Jj] *%|([^}%|]+)}}$', -- no-wrapped redirect, yes '{{j|...}}'
}

--- Strip one layer of common "extras" (refs, HTML comments, flag icons, no-wrap templates)
-- from around a field value.
-- The patterns are tried in order and the first one that matches wins; its capture is
-- returned after mw.text.trim(). Only a single wrapper kind is removed per call.
-- @param value the field text to clean up
-- @return the trimmed inner value if a pattern matched, otherwise value unchanged
function stripFieldExtras(value)
	-- todo: do progressive scan like with that seek string just for ref tags and such
	-- note that we can't just replace matches with whitespace and catch them all, because it could be like '3 August<!---->20<ref/>12'
	for i = 1, #STRIP_PATTERNS do
		local matchStrip = value:match(STRIP_PATTERNS[i])
		if (matchStrip) then return mw.text.trim(matchStrip) end
	end

	return value -- if we didn't match anything, abort now
end
 
Line 719 ⟶ 807:
end
 
input = stripFieldExtras(mw.text.trim(input))
 
-- If there is nothing but whitespace, don't bother
Line 769 ⟶ 857:
 
function main.outputRawStripped(frame)
	-- Take the first argument, undo strip markers, decode HTML entities and trim it,
	-- then hand the result to stripFieldExtras() and return whatever that gives back
	local rawArgument = frame.args[1]
	local unstripped = mw.text.unstrip(rawArgument)
	local decoded = mw.text.decode(unstripped)
	local trimmed = mw.text.trim(decoded)
	return stripFieldExtras(trimmed)
end
 
function main.reoutputme(frame)
--if (frame.args.preprocess and frame.args.preprocess == 'yes') then
-- return frame:preprocess(mw.text.decode(mw.text.unstrip(frame.args[1])))
.. '[[Category:Dummy]]' --else
-- return mw.text.decode(mw.text.unstrip(frame.args[1]))
else
--end
return mw.text.decode(mw.text.unstrip(frame.args[1])) .. '[[Category:Dummy]]'
end
----[[
-- frame.args[1]
-- frame:getParent().args[1]
-- frame:getArgument(1)
--do return frame:getArgument(1):expandTo('text/plain') end
--[[local input = mw.text.decode(mw.text.unstrip(frame.args[1]))
s = 'Len=' .. #input .. ' '
Line 785 ⟶ 881:
s = s .. string.sub(input, i, i) .. ' '
end
return s--]]
end