Module:Lang: Difference between revisions

Content deleted Content added
No edit summary
No edit summary
Line 24:
 
local cfg = mw.loadData ('Module:Lang/configuration' .. (mw.getCurrentFrame():getTitle():match ('/sandbox') or '')); -- for internationalization
 
local is_latn_data = mw.loadData ('Module:Lang/data/is latn data');
local sizeof_ranges_t = is_latn_data.sizeof_ranges_t;
 
local namespace = mw.title.getCurrentTitle().namespace; -- used for categorization
Line 1,019 ⟶ 1,022:
]]
 
local function text_script_match_test (script, is_latn_text, pos, char)
local scripts_t = {['latf'] = true, ['latg'] = true, ['latn'] = true}; -- unicode 'latn' scripts; 'latf' and 'latg' are font variants so there are no Fraktur or Gaelic codepoints
if is_set (script) then -- don't bother with the rest of this if <script> is nil or empty string
Line 1,029 ⟶ 1,032:
else -- when text is not wholly Latn script
if scripts_t[script] then -- but a Latn script is specified
return substitute (cfg.text_script_match_test_t.latn_scr_mismatch, {pos, char}); -- emit an error message with position of first offending character
end
end
Line 1,036 ⟶ 1,039:
 
 
--[[--------------------------< B I N A R Y _ S E A R C H >--------------------------------------------------

conducts a binary search of <ranges_t> for a sub-range that holds <target>.

<target> is a number (callers pass a unicode codepoint); <ranges_t> is a sequence of sub-ranges where each
sub-range is a two-member sequence: {<low value>, <high value>}.  The search assumes that the sub-ranges are
sorted in ascending order and do not overlap.

the number of sub-ranges in <ranges_t> is taken from the module-level <sizeof_ranges_t>, not computed here.

returns boolean true if a sub-range holding <target> is found; boolean false else.

]]

local function binary_search (target, ranges_t)
	local idx_bot = 1;															-- index of the first sub-range still under consideration
	local idx_top = sizeof_ranges_t;											-- index of the last sub-range still under consideration

	if idx_top < idx_bot then													-- no sub-ranges at all so nothing can hold <target>
		return false;
	end

	if (target < ranges_t[idx_bot][1]) or (target > ranges_t[idx_top][2]) then	-- quick reject; <target> lies outside of everything that <ranges_t> covers
		return false;															-- abandon
	end

	while idx_bot <= idx_top do													-- until the window of candidate sub-ranges is empty
		local idx_mid = math.floor ((idx_bot + idx_top) / 2);					-- index of the sub-range midway between bottom and top
		local range_t = ranges_t[idx_mid];

		if target < range_t[1] then												-- <target> less than this sub-range's low value?
			idx_top = idx_mid - 1;												-- discard this sub-range and everything above it
		elseif target > range_t[2] then											-- <target> greater than this sub-range's high value?
			idx_bot = idx_mid + 1;												-- discard this sub-range and everything below it
		else																	-- low value <= <target> <= high value
			return true;														-- we found the sub-range that holds <target>
		end
	end

	return false;																-- window exhausted; <target> falls in a gap between sub-ranges
end


--[[--------------------------< I S _ L A T I N >--------------------------------------------------------------

tests every codepoint in <text> against the lists of codepoints that are accepted as Latn script:
	is_latn_data.singles_t – individual codepoints
	is_latn_data.ranges_t – codepoint ranges; searched with binary_search()
	is_latn_data.specials_t – codepoints accepted only for particular <tag> values

returns:
	boolean true and <text> with the markup removed when <text> is wholly wrapped in accept-as-written markup:
		((<text>)); the codepoints are not inspected in this case
	boolean true and <text> when every codepoint is known
	boolean false, <text>, the left-to-right position in <text> of the first codepoint that is not known,
		and that codepoint as a character

TODO: when text has accept-as-written markup, return a non-boolean value to indicate that <text> is not wholly
latn script?  Use that return value to create non-Latn html lang= attribute because <text> isn't really
latn so lang=und (undetermined)? or instead, omit the -Latn subtag? (without -latn need to force |italic=yes)

]]

local function is_latin (text, tag)
	local stripped, markup_count = text:gsub ('^%(%((.+)%)%)$', '%1');			-- strip accept-as-written markup if present
	if markup_count > 0 then
		return true, stripped;													-- markup was present; take the editor's word that <text> is Latn script
	end

	local singles_t = is_latn_data.singles_t;
	local ranges_t = is_latn_data.ranges_t;
	local specials_t = is_latn_data.specials_t;

	local index = 0;															-- character position; reported in error messaging
	for cp in mw.ustring.gcodepoint (stripped) do								-- one codepoint at a time, left to right
		index = index + 1;

		local known = singles_t[cp] or binary_search (cp, ranges_t);			-- in the singles list or a member of a listed range?
		if not known and tag then												-- no; perhaps accepted for this particular <tag>
			local by_tag_t = specials_t[cp];
			known = by_tag_t and by_tag_t[tag];
		end

		if not known then
			return false, stripped, index, mw.ustring.char (cp);				-- not known; report where and which character
		end
	end

	return true, stripped;														-- every codepoint is known
end
 
Line 1,139 ⟶ 1,177:
end
 
local is_latn_text, pos = is_latin (args.text, subtags.script)char; -- make a boolean
is_latn_text, args.text, pos, char= is_latin (args.text, args.code); -- make a boolean
 
msg = text_script_match_test (subtags.script, is_latn_text, pos, char)
if msg then -- if an error detected then there is an error message
return make_error_msg (msg, args, template);
Line 1,332 ⟶ 1,371:
 
if args.translit then
local latn, pos = is_latin (args.translit, nil, true)char;
latn, args.translit, pos, char = is_latin (args.translit, args[1] or args.code);
if not latn then
return make_error_msg (substitute (cfg.lang_xx_t.translit_nonlatn, {pos, char}), args, template);
end
end
Line 1,364 ⟶ 1,404:
end
local is_latn_text, text, pos, char = is_latin (args.text, subtags.scriptcode); -- make a boolean
args.text = text; -- may have been modified (accept-as-written markup removed)
 
msg = text_script_match_test (subtags.script, is_latn_text, pos, char)
if msg then -- if an error detected then there is an error message
return make_error_msg (msg, args, template);
Line 1,860 ⟶ 1,901:
end
 
local latnis_latn_text, pos = is_latin (args.text, nil, true)char;
is_latn_text, args.text, pos, char= is_latin (args.text, args.code); -- is latn text? strip accept-as-written markup
if not latn then -- text is not latn
if not is_latn_text then -- when text is not latn
return make_error_msg (substitute (cfg.lang_xx_t.translit_nonlatn, {pos}), args, template);
return make_error_msg (substitute (cfg.lang_xx_t.translit_nonlatn, {pos, char}), args, template); -- abandon with error message
end