Revision as of 00:45, 6 January 2025 view source Trappist the monk (talk \| contribs) Administrators 494,436 edits No edit summary ← Previous edit		Revision as of 19:26, 18 January 2025 view source Trappist the monk (talk \| contribs) Administrators 494,436 edits No edit summary Next edit →
Line 24: local cfg = mw.loadData ('Module:Lang/configuration' .. (mw.getCurrentFrame():getTitle():match ('/sandbox') or '')); -- for internationalization local is_latn_data = mw.loadData ('Module:Lang/data/is latn data'); local sizeof_ranges_t = is_latn_data.sizeof_ranges_t; local namespace = mw.title.getCurrentTitle().namespace; -- used for categorization Line 1,019 ⟶ 1,022: ]] local function text_script_match_test (script, is_latn_text, pos, char) local scripts_t = {['latf'] = true, ['latg'] = true, ['latn'] = true}; -- unicode 'latn' scripts; 'latf' and 'latg' are font variants so there are no Fraktur or Gaelic codepoints if is_set (script) then -- don't bother with the rest of this if <script> is nil or empty string Line 1,029 ⟶ 1,032: else -- when text is not wholly Latn script if scripts_t[script] then -- but a Latn script is specified return substitute (cfg.text_script_match_test_t.latn_scr_mismatch, {pos, char}); -- emit an error message with position of first offending character end end Line 1,036 ⟶ 1,039: --[[--------------------------< ~~I S _ L A T I N~~B I N A R Y _ S E A R C H >~~-----------~~--------------------------------------------------- conducts a binary search of <ranges_t> for a sub-range that holds <target>. ~~Created because unicode does not have Latin theta 'θ' character. The Greek 'θ' is, apparently, commonly used~~ ~~in certain romanizations. This function attempts to discover if <text> script is Latn with Greek 'θ' or some~~ returns boolean true if a sub-range holding <target> is found; boolean false else. 
~~other script with Greek 'θ'.~~ ~~when <text> is or has Greek 'θ':~~ ~~for {{lang}}, {{langx}}, and {{lang-??}}, returns boolean true when <text>:~~ ~~has theta and all other characters are Latn~~ ~~has theta as the only character, <script> is Latn~~ ~~for {{transliteration}}, returns boolean true when <text>:~~ ~~has theta and all other characters are Latn~~ ~~has theta as the only character~~ ]] local function ~~is_latin~~binary_search (~~text~~target, ~~script, transl~~ranges_t) ~~text~~local ~~=~~idx_bot ~~mw.text.decode~~= ~~(text)~~1; -- ~~convert~~initialize ~~html~~to ~~entities~~index ~~to~~of first ~~characters~~key local idx_top = sizeof_ranges_t; -- initialize to index of last key (number of keys) ~~script = (script and script:lower()) or nil; -- ensure lowercase for comparisons~~ ~~local whitelist = '[ʻʼʽʾʿΔαβγδθλσφχϑъьӾӿᾱῑ῾上入去平阳阴]'~~ if (target < ranges_t[idx_bot][1]) or (target > ranges_t[idx_top][2]) then -- invalid; target out of range ~~if not transl then -- testing {{lang}} or {{langx}} <text>~~ return; -- abandon ~~if '' == mw.ustring.gsub (text, whitelist, '') then -- when <text> uses only whitelisted characters~~ ~~return ('latn' == script); -- true when latn script; false when other script or no script~~ ~~end~~ end local idx_mid; -- calculated index of range midway between top index and bottom index ~~if mw.ustring.find (text, whitelist) then -- does <text> contain characters from the whitelist?~~ local flag = false; -- flag to tell us when we've evaluated last (highest) range in <ranges_t> ~~text = mw.ustring.gsub (text, whitelist, 'x'); -- replace whitelisted characters from <text> with known Latn-script char 'x'~~ ~~-- if 0 == text:len() then -- will be zero if theta was the only character in <text>~~ while 1 do ~~if '' == text then -- will be empty string if the only character(s) in <text> are in <whitelist>~~ idx_mid = math.ceil ((idx_bot + idx_top) / 2); -- get the mid-point in the <ranges_t> sequence ~~if transl then -- not nil for 
{{transliteration}}; assume Latn because this is 'transliteration' template~~ if (target >= ranges_t[idx_mid][1]) and (target <= ranges_t[idx_mid][2]) then -- indexed range low value <= target <= indexed range high value ~~return true;~~ return true; -- we found the range that holds the <target> character; return true ~~end~~ ~~return (script and ('latn' == script)) or false; -- {{lang}}, {{langx}}, and {{lang-??}}; true when script is Latn; false else~~ elseif (target > ranges_t[idx_mid][2]) then -- is <target> > indexed range high value? idx_bot = idx_mid; -- adjust <idx_bot> up else -- here when <target> less than indexed range low value idx_top = idx_mid - 1; -- adjust <idx_top> down end if flag then break; -- here when we just evaluated the last range and <target> not found end if not flag and (idx_bot == idx_top) then -- set true just before we evaluate the last (highest) range in <ranges_t> flag = true; end ~~return unicode.is_Latin (text); -- return true when all characters in modified <text> are Latn script; false else~~ end end ~~return unicode.is_Latin (text); -- return true when all characters in <text> are Latn script; false else~~ --[[--------------------------< I S _ L A T I N >-------------------------------------------------------------- compare <text> as codepoints to lists of known codepoints accepted as Latn script returns boolean true and modified <text> when <text> is wrapped in accept-as-written markup returns boolean true and <text> when codepoint is known returns boolean false, <text>, non-Latn codepoint position in <text> (left to right), and the codepoint character when codepoint is not known TODO: when text has accept-as-written markup, return a non-boolean value to indicate that <text> is not wholly latn script? Use that return value to create non-Latn html lang= attribute because <text> isn't really latn so lang=und (undetermined)? or instead, omit the -Latn subtag? 
(without -latn need to force \|italic=yes) ]] local function is_latin (text, tag) local count; text, count = text:gsub ('^%(%((.+)%)%)$', '%1'); -- remove accept-as-written markup if present if 0 ~= count then return true, text; -- markup present so assume that <text> is Latn-script end local pos = 0; -- position counter for error messaging for codepoint in mw.ustring.gcodepoint (text) do -- fetch each code point pos = pos + 1; -- bump the position counter if not is_latn_data.singles_t[codepoint] and -- codepoint not found in the singles list? not binary_search (codepoint, is_latn_data.ranges_t) and -- codepoint not a member of a listed range? not (tag and is_latn_data.specials_t[codepoint] and is_latn_data.specials_t[codepoint][tag]) then -- not a language-specific codepoint? return false, text, pos, mw.ustring.char (codepoint); -- codepoint not known; return false with codepoint position and character representation end end return true, text; -- is known; return <text> end Line 1,139 ⟶ 1,177: end local is_latn_text, pos ~~= is_latin (args.text~~, ~~subtags.script)~~char; ~~-- make a boolean~~ is_latn_text, args.text, pos, char= is_latin (args.text, args.code); -- make a boolean msg = text_script_match_test (subtags.script, is_latn_text, pos, char) if msg then -- if an error detected then there is an error message return make_error_msg (msg, args, template); Line 1,332 ⟶ 1,371: if args.translit then local latn, pos ~~= is_latin (args.translit, nil~~, ~~true)~~char; latn, args.translit, pos, char = is_latin (args.translit, args[1] or args.code); if not latn then return make_error_msg (substitute (cfg.lang_xx_t.translit_nonlatn, {pos, char}), args, template); end end Line 1,364 ⟶ 1,404: end local is_latn_text, text, pos, char = is_latin (args.text, ~~subtags.script~~code); -- make a boolean args.text = text; -- may have been modified (accept-as-written markup removed) msg = text_script_match_test (subtags.script, is_latn_text, pos, char) if msg then -- if an error 
detected then there is an error message return make_error_msg (msg, args, template); Line 1,860 ⟶ 1,901: end local ~~latn~~is_latn_text, pos ~~= is_latin (args.text, nil~~, ~~true)~~char; is_latn_text, args.text, pos, char= is_latin (args.text, args.code); -- is latn text? strip accept-as-written markup ~~if not latn then -- text is not latn~~ if not is_latn_text then -- when text is not latn ~~return make_error_msg (substitute (cfg.lang_xx_t.translit_nonlatn, {pos}), args, template);~~ return make_error_msg (substitute (cfg.lang_xx_t.translit_nonlatn, {pos, char}), args, template); -- abandon with error message end

Module:Lang: Difference between revisions