Content deleted Content added
No edit summary |
No edit summary |
||
Line 24:
local cfg = mw.loadData ('Module:Lang/configuration' .. (mw.getCurrentFrame():getTitle():match ('/sandbox') or '')); -- for internationalization
local is_latn_data = mw.loadData ('Module:Lang/data/is latn data');
local sizeof_ranges_t = is_latn_data.sizeof_ranges_t;
local namespace = mw.title.getCurrentTitle().namespace; -- used for categorization
Line 1,019 ⟶ 1,022:
]]
local function text_script_match_test (script, is_latn_text, pos, char)
local scripts_t = {['latf'] = true, ['latg'] = true, ['latn'] = true}; -- unicode 'latn' scripts; 'latf' and 'latg' are font variants so there are no Fraktur or Gaelic codepoints
if is_set (script) then -- don't bother with the rest of this if <script> is nil or empty string
Line 1,029 ⟶ 1,032:
else -- when text is not wholly Latn script
if scripts_t[script] then -- but a Latn script is specified
return substitute (cfg.text_script_match_test_t.latn_scr_mismatch, {pos, char}); -- emit an error message with position of first offending character
end
end
Line 1,036 ⟶ 1,039:
--[[--------------------------< B I N A R Y _ S E A R C H >----------------------------------------------------

conducts a binary search of <ranges_t> for a sub-range that holds <target>.

<target> is a codepoint number; <ranges_t> is a sequence of {low, high} pairs.  The search assumes that the pairs
are sorted in ascending order and do not overlap (TODO: confirm against the data module).  The number of pairs is
taken from the module-level <sizeof_ranges_t>, not from <ranges_t> itself.

returns boolean true if a sub-range holding <target> is found; boolean false else.

NOTE(review): the function name and parameter order are taken from the call in is_latin(); the initial value of
<idx_bot> (1, the first key) is assumed – confirm against the live module.

]]

local function binary_search (target, ranges_t)
	local idx_bot = 1;															-- initialize to index of first key
	local idx_top = sizeof_ranges_t;											-- initialize to index of last key (number of keys)

	if idx_top < 1 then															-- no ranges to search
		return false;
	end

	if (target < ranges_t[idx_bot][1]) or (target > ranges_t[idx_top][2]) then	-- target below lowest range or above highest range
		return false;															-- abandon; explicit boolean so that the return matches the header
	end

	while idx_bot <= idx_top do													-- loop ends when the search window is empty
		local idx_mid = math.floor ((idx_bot + idx_top) / 2);					-- index of the range midway between bottom and top
		local range_t = ranges_t[idx_mid];

		if target < range_t[1] then												-- <target> less than indexed range low value
			idx_top = idx_mid - 1;												-- adjust <idx_top> down
		elseif target > range_t[2] then											-- <target> greater than indexed range high value
			idx_bot = idx_mid + 1;												-- adjust <idx_bot> up; <idx_mid> already rejected
		else																	-- indexed range low value <= target <= indexed range high value
			return true;														-- we found the range that holds the <target> character
		end
	end

	return false;																-- every candidate range evaluated; <target> not found
end
--[[--------------------------< I S _ L A T I N >--------------------------------------------------------------
compare <text> as codepoints to lists of known codepoints accepted as Latn script
returns boolean true and modified <text> when <text> is wrapped in accept-as-written markup
returns boolean true and <text> when codepoint is known
returns boolean false, <text>, non-Latn codepoint position in <text> (left to right), and the codepoint character
when codepoint is not known
TODO: when text has accept-as-written markup, return a non-boolean value to indicate that <text> is not wholly
latn script? Use that return value to create non-Latn html lang= attribute because <text> isn't really
latn so lang=und (undetermined)? or instead, omit the -Latn subtag? (without -latn need to force |italic=yes)
]]
local function is_latin (text, tag)
	-- strip accept-as-written markup, ((...)), when it wraps the whole of <text>
	local stripped, markup_count = text:gsub ('^%(%((.+)%)%)$', '%1');
	if markup_count > 0 then
		return true, stripped;													-- markup was present; accept <text> as Latn without inspection
	end

	local position = 0;															-- 1-based index of the codepoint under test; used for error messaging
	for cp in mw.ustring.gcodepoint (stripped) do								-- walk <text> one codepoint at a time
		position = position + 1;

		-- known when listed individually or when inside one of the listed ranges
		local known = is_latn_data.singles_t[cp] or binary_search (cp, is_latn_data.ranges_t);

		if not known and tag then												-- last chance: codepoint accepted for this particular language tag?
			local specials_t = is_latn_data.specials_t[cp];
			known = specials_t and specials_t[tag];
		end

		if not known then
			return false, stripped, position, mw.ustring.char (cp);			-- unknown codepoint; report where it is and what it looks like
		end
	end

	return true, stripped;														-- every codepoint is known
end
Line 1,139 ⟶ 1,177:
end
local is_latn_text, pos
is_latn_text, args.text, pos, char= is_latin (args.text, args.code); -- make a boolean
msg = text_script_match_test (subtags.script, is_latn_text, pos, char)
if msg then -- if an error detected then there is an error message
return make_error_msg (msg, args, template);
Line 1,332 ⟶ 1,371:
if args.translit then
local latn, pos
latn, args.translit, pos, char = is_latin (args.translit, args[1] or args.code);
if not latn then
return make_error_msg (substitute (cfg.lang_xx_t.translit_nonlatn, {pos, char}), args, template);
end
end
Line 1,364 ⟶ 1,404:
end
local is_latn_text, text, pos, char = is_latin (args.text,
args.text = text; -- may have been modified (accept-as-written markup removed)
msg = text_script_match_test (subtags.script, is_latn_text, pos, char)
if msg then -- if an error detected then there is an error message
return make_error_msg (msg, args, template);
Line 1,860 ⟶ 1,901:
end
local
is_latn_text, args.text, pos, char= is_latin (args.text, args.code); -- is latn text? strip accept-as-written markup
if not is_latn_text then -- when text is not latn
return make_error_msg (substitute (cfg.lang_xx_t.translit_nonlatn, {pos, char}), args, template); -- abandon with error message
end
|