Module:Sandbox/Erutuon: Difference between revisions

Browse history interactively

← Previous edit

Content deleted Content added

Revision as of 20:46, 3 July 2018 edit Erutuon (talk \| contribs) Autopatrolled, Extended confirmed users 32,238 edits place invalid tag suffix in "invalid" field, use "error" to indicate both that an error was present and the type of error ← Previous edit		Latest revision as of 09:39, 16 September 2019 edit undo Erutuon (talk \| contribs) Autopatrolled, Extended confirmed users 32,238 edits function to print data module for "default ignorable" property
(45 intermediate revisions by 2 users not shown)
Line 1: local p = {} function p.show(frame) ~~local Unicode_data = require 'Module:Unicode data/sandbox'~~ local page = frame.args[1] or "User:Erutuon/Unicode/DerivedCoreProperties.txt" local text = assert(mw.title.new(page):getContent()) ~~local function errorf(level, ...)~~ local defaultIgnorable = text ~~if type(level) == number then~~ :match("Derived Property: Default_Ignorable_Code_Point.-(%f[^\n]%x%x%x%x.-)%s\n# Total code points") ~~return error(string.format(...), level + 1)~~ local singles, ranges = {}, {} ~~else -- level is actually the format string.~~ for codePoint1, codePoint2 in defaultIgnorable:gmatch("%f[^\n%z](%x+)%.?%.?(%x)") do ~~return error(string.format(level, ...), 2)~~ codePoint1, codePoint2 = tonumber(codePoint1, 16), tonumber(codePoint2, 16) ~~end~~ local lastRange = ranges[#ranges] ~~end~~ if lastRange and lastRange[2] == codePoint1 - 1 then lastRange[2] = codePoint2 or codePoint1 ~~function mw.logf(...)~~ else ~~return mw.log(string.format(...))~~ if not codePoint2 then ~~end~~ singles[codePoint1] = true else ~~local output_mt = {}~~ table.insert(ranges, { codePoint1, codePoint2 }) ~~function output_mt:insert(str)~~ end ~~self.n = self.n + 1~~ end ~~self[self.n] = str~~ ~~end~~ ~~-- also in [[Module:Unicode data/documentation functions]]~~ ~~function output_mt:insert_format(...)~~ ~~self:insert(string.format(...))~~ ~~end~~ ~~output_mt.join = table.concat~~ ~~output_mt.__index = output_mt~~ ~~local function Output()~~ ~~return setmetatable({ n = 0 }, output_mt)~~ ~~end~~ ~~local Latn_pattern = table.concat {~~ ~~'[',~~ ~~'\n\32-\127',~~ ~~'\194\160-\194\172',~~ ~~'\195\128-\195\191',~~ ~~'\196\128-\197\191',~~ ~~'\198\128-\201\143',~~ ~~'\225\184\128-\225\187\191',~~ ~~'\226\177\160-\226\177\191',~~ ~~'\234\156\160-\234\159\191',~~ ~~'\234\172\176-\234\173\175',~~ ~~'\239\172\128-\239\172\134',~~ ~~'\239\188\129-\239\188\188',~~ ~~'–',~~ ~~'—',~~ ~~'«', '»',~~ ~~']',~~ }; ~~local get_codepoint = mw.ustring.codepoint~~ ~~local function expand_range(start, ending)~~ ~~local lower, higher = get_codepoint(start), get_codepoint(ending)~~ ~~if higher < lower then~~ ~~return nil~~ ~~end~~ ~~local chars = {}~~ ~~local i = 0~~ ~~for codepoint = lower, higher do~~ ~~i = i + 1~~ ~~chars[i] = mw.ustring.char(codepoint)~~ ~~end~~ ~~return table.concat(chars)~~ ~~end~~ ~~local fun = require "Module:Fun"~~ ~~local m_table = require "Module:Table"~~ ~~local script_to_count_mt = {~~ ~~__index = function (self, key)~~ ~~self[key] = 0~~ ~~return 0~~ ~~end,~~ ~~__call = function (self, ...)~~ ~~return setmetatable({}, self)~~ ~~end~~ } ~~setmetatable(script_to_count_mt, script_to_count_mt)~~ ~~-- Uses an iterator (such as mw.ustring.gcodepoint) that generates a codepoint~~ ~~-- each time it is called with an optional state and another value.~~ ~~local function show_scripts(iterator, state, value)~~ ~~local script_to_count = script_to_count_mt()~~ ~~for codepoint in iterator, state, value do~~ ~~local script = Unicode_data.lookup_script(codepoint)~~ ~~script_to_count[script] = script_to_count[script] + 1~~ ~~end~~ ~~return table.concat(~~ ~~fun.mapIter(~~ ~~function (count, script)~~ ~~return ("%s (%d)"):format(script, count)~~ ~~end,~~ ~~m_table.sortedPairs(~~ ~~script_to_count,~~ ~~function (script1, script2)~~ ~~return script_to_count[script1] > script_to_count[script2]~~ ~~end)),~~ ~~", ")~~ ~~end~~ ~~local function get_chars_in_scripts(iterator, state, value)~~ ~~local script_to_char_set = {}~~ ~~for codepoint in iterator, state, value do~~ ~~local script = Unicode_data.lookup_script(codepoint)~~ ~~script_to_char_set[script] = script_to_char_set[script] or {}~~ ~~script_to_char_set[script][codepoint] = true~~ end local template = [[ ~~return script_to_char_set~~ local data = {} ~~end~~ data.defaultIgnorable = { ~~local function print_char_set_map(script_to_char_set, format, separator)~~ singles = { ~~format = format or "%s: %s"~~ ... ~~separator = separator or "\n"~~ }, ~~return table.concat(~~ ~~fun.mapIter(~~ ~~function (char_set, script)~~ ~~local char_list = fun.mapIter(~~ ~~function (_, codepoint)~~ ~~return mw.ustring.char(codepoint)~~ ~~end,~~ ~~m_table.sortedPairs(char_set))~~ ~~return (format):format(script, mw.text.nowiki(table.concat(char_list)))~~ ~~end,~~ ~~m_table.sortedPairs(script_to_char_set)),~~ ~~separator)~~ ~~end~~ ~~function p.show(frame)~~ ~~local expanded_pattern = Latn_pattern~~ ~~:gsub('%[(.-)%]', '%1')~~ ~~:gsub( -- Find two UTF-8-encoded characters separated by hyphen-minus.~~ ~~'([%z\1-\127\194-\244][\128-\191])%-([%z\1-\127\194-\244][\128-\191])',~~ ~~function (char1, char2)~~ ~~return expand_range(char1, char2)~~ ~~end)~~ ranges = { ~~return ('* <div style="overflow-wrap: break-word;">%s</div><br>%s')~~ ... ~~:format(expanded_pattern~~ }, ~~:gsub('^%s', ''), -- Remove initial '\n ' to avoid creating unwanted pre element.~~ } ~~show_scripts(mw.ustring.gcodepoint(expanded_pattern)))~~ ~~end~~ return data ~~local function get_block_info_from_arg(args, arg)~~ ]] ~~local block_name = args[1]~~ ~~or errorf("Parameter %s is required", tostring(arg))~~ ~~local block_info = Unicode_data.get_block_info(block_name)~~ ~~or errorf("The block '%s' could be found", block_name)~~ ~~return block_info~~ ~~end~~ local Array = require "Module:array" ~~local function get_boolean_from_arg(args, arg)~~ local printedRanges = Array() ~~return args[arg] and require "Module:Yesno" (args[arg])~~ for _, range in ipairs(ranges) do ~~end~~ local low, high, script_code = unpack(range) printedRanges:insert(('\t\t{ 0x%05X, 0x%05X },'):format(low, high)) ~~function p.scripts_in_block(frame)~~ ~~local block_info = get_block_info_from_arg(frame.args, 1)~~ ~~local show_block_name = get_boolean_from_arg(frame.args, 2)~~ ~~local script_list = show_scripts(fun.range(block_info[1], block_info[2]))~~ ~~if show_block_name then~~ ~~return ("%s: %s"):format(block_info[3], script_list)~~ ~~else~~ ~~return script_list~~ end ~~end~~ ~~local function link_block_name(block_name)~~ ~~if block_name:find ' ' then~~ ~~return ("[[%s]]"):format(block_name)~~ ~~else~~ ~~return ("[[%s (Unicode block)\|%s]]"):format(block_name, block_name)~~ ~~end~~ ~~end~~ ~~function p.scripts_in_blocks(frame)~~ ~~local output = Output()~~ ~~local start = frame.args[1] and tonumber(frame.args[1], 16) or 0~~ ~~local ending = frame.args[2] and tonumber(frame.args[2], 16) or 0x4000~~ local printedSingles = Array() ~~local script_data = mw.loadData "Module:Unicode data/scripts"~~ for codepoint in require 'Module:TableTools'.sortedPairs(singles) do ~~local singles = script_data.singles~~ printedSingles:insert(('\t\t[0x%05X] = true,'):format(codepoint)) ~~local ranges = script_data.ranges~~ ~~local function clear (self)~~ ~~for _, key in ipairs(m_table.keysToList(self, false)) do~~ ~~self[key] = nil~~ ~~end~~ end local ~~counts~~data = {}template :gsub('%.%.%.', printedSingles:concat('\n'), 1) ~~setmetatable(counts, {~~ :gsub('%.%.%.', printedRanges:concat('\n'), 1) ~~__index = {~~ ~~increment = function(self, script_code, amount)~~ ~~self[script_code] = (self[script_code] or 0) + (amount or 1)~~ ~~end,~~ ~~clear = clear,~~ } }) ~~local codepoints_per_script = {}~~ ~~setmetatable(codepoints_per_script, {~~ ~~__index = {~~ ~~add = function(self, script_code, codepoint)~~ ~~self[script_code] = self[script_code] or { n = 0 }~~ ~~if self[script_code].n <= 0x20~~ ~~and not (codepoint <= 0x9F and (codepoint >= 0x80~~ ~~or codepoint <= 0x1F)) then~~ ~~if self[script_code].n == 0x20 then~~ ~~local period = ('.'):byte()~~ ~~for _ = 1, 3 do~~ ~~self[script_code].n = self[script_code].n + 1~~ ~~self[script_code][self[script_code].n] = period~~ ~~end~~ ~~else~~ ~~if script_code == "Zinh" then -- probably combining character~~ ~~self[script_code].n = self[script_code].n + 1~~ ~~self[script_code][self[script_code].n] = 0x25CC~~ ~~end~~ ~~self[script_code].n = self[script_code].n + 1~~ ~~self[script_code][self[script_code].n] = codepoint~~ ~~end~~ ~~end~~ ~~end,~~ ~~clear = clear,~~ } }) return data ~~output:insert [[~~ ~~{\| class="wikitable"~~ ~~\|+ Scripts in each Unicode block~~ ~~! block !! codepoints !! scripts~~ ]] ~~for _, block in pairs(mw.loadData "Module:Unicode data/blocks") do~~ ~~local codepoint = block[1]~~ ~~if codepoint > ending then break end~~ ~~if codepoint >= start then~~ ~~while codepoint <= block[2] do~~ ~~local script = singles[codepoint]~~ ~~local count~~ ~~if script then -- Codepoint is in "singles" map.~~ ~~counts:increment(script)~~ ~~codepoints_per_script:add(script, codepoint)~~ ~~codepoint = codepoint + 1~~ ~~count = 1 -- for potential future use~~ ~~else~~ ~~local range, index = Unicode_data.binary_range_search(codepoint, ranges)~~ ~~if range then -- Codepoint is in "ranges" array.~~ ~~count = 0~~ ~~script = range[3]~~ ~~while codepoint <= range[2] and codepoint <= block[2] do~~ ~~count = count + 1~~ ~~codepoints_per_script:add(script, codepoint)~~ ~~codepoint = codepoint + 1~~ ~~end~~ ~~counts:increment(script, count)~~ ~~else -- Codepoint doesn't have data; it's Zzzz.~~ ~~-- Get range immediately above codepoint.~~ ~~while ranges[index][2] < codepoint do~~ ~~index = index + 1~~ ~~end~~ ~~count = 0~~ ~~script = "Zzzz"~~ ~~local range = ranges[index]~~ ~~while codepoint < range[1] and codepoint <= block[2]~~ ~~and not singles[codepoint] do~~ ~~count = count + 1~~ ~~codepoint = codepoint + 1~~ ~~end~~ ~~counts:increment(script, count)~~ ~~end~~ ~~end~~ ~~end~~ ~~output:insert_format([[~~ \|- ~~\| %s~~ ~~\| U+%04X–U+%04X~~ ~~\| %s~~ ~~]], link_block_name(block[3]), block[1], block[2],~~ ~~table.concat(~~ ~~fun.map(~~ ~~function (count, script)~~ ~~return ('<abbr title="%s">%s</abbr> (<span title="%s">%d</span>)')~~ ~~:format(~~ ~~script_data.aliases[script], script,~~ ~~codepoints_per_script[script]~~ ~~and mw.text.nowiki(mw.ustring.char(~~ ~~unpack(codepoints_per_script[script])))~~ ~~or "",~~ ~~count)~~ ~~end,~~ ~~m_table.sortedPairs(~~ ~~counts,~~ ~~function (script1, script2)~~ ~~return counts[script1] > counts[script2]~~ ~~end)),~~ ~~", "))~~ ~~end~~ ~~-- mw.logObject(codepoints_per_script, block[3])~~ ~~counts:clear()~~ ~~codepoints_per_script:clear()~~ ~~end~~ ~~output:insert "\|}"~~ ~~return output:join()~~ end local Unicode_data = require "Module:Unicode data/sandbox" ~~function p.chars_in_scripts_in_block(frame)~~ local fun = require "Module:fun" ~~local block_info = get_block_info_from_arg(frame.args, 1)~~ local m_table = require "Module:TableTools" ~~local show_block_name = get_boolean_from_arg(frame.args, 2)~~ ~~local script_char_set_map = print_char_set_map(~~ local function errorf(level, ...) ~~get_chars_in_scripts(fun.range(block_info[1], block_info[2])))~~ if ~~show_block_name~~type(level) == "number" then return error(~~"%s: %s"):~~string.format(~~block_info[3]~~...), ~~script_char_set_map~~level + 1) else -- level is actually the format string. ~~else~~ return ~~script_char_set_map~~error(string.format(level, ...), 2) end end Line 340 ⟶ 80: local language_codes = {} for lang_template in content:gmatch '"{{lang[^}]+'" do local template_name = lang_template:match('"{{([^\|}]+)'") local language_code if template_name == '"lang'" then language_code = lang_template:match '"{{lang\|([^\|}]+)'" elseif template_name:find '"^lang-'" then language_code = lang_template:match '"{{lang-([^\|}]+)'" end if language_code then Line 353 ⟶ 93: end return table.concat(m_table.keysToList(language_codes), '", '") end local parsed_subtags_mt = { ~~-- A previous draft, in [[Module:Lang/sandbox]].~~ __index = { -- "error" is the error message. -- "index" is the ordinal of the subtag in which the error was found. throw = function (self, error, index) self.error = self.error_messages[error] self.invalid = table.concat(self.input, "-", index) return self:remove_unnecessary_fields() end, remove_unnecessary_fields = function (self) -- Only useful internally. self.input = nil self:pretty_print() p.validate_lang_tag(self) return self end, -- Regularize capitalization of language subtags: -- ZH-LATN -> zh-Latn, FR-ca -> fr-CA pretty_print = function (self) for key, func in pairs(self.print_funcs) do if self[key] then self[key] = func(self[key]) end end return self end, -- Re-create the original tag from the parsed subtags. get_tag = function (self) if self.tag then return self.tag end local tag = {} for _, subtag_name in ipairs(self.subtag_order) do if subtag_name == "private_use" then table.insert(tag, "x") end if type(self[subtag_name]) == "table" then for _, subtag in ipairs(self[subtag_name]) do table.insert(tag, subtag) end else table.insert(tag, self[subtag_name]) end end tag = table.concat(tag, "-") self.tag = tag -- Cache the result. return tag end, subtag_order = { "language", "script", "region", "variant", "private_use" }, error_messages = { invalid_characters = "invalid characters", no_language = "no language subtag", invalid_subtag = "invalid subtag", invalid_private_use = "length of private-use subtag out of range", empty_private_use = "empty private-use subtag", } } } local function initial_caps_helper(initial, rest) return string.upper(initial) .. string.lower(rest) end local function lower_or_map_lower(str) if type(str) == "table" then return fun.map(string.lower, str) else return string.lower(str) end end parsed_subtags_mt.__index.print_funcs = { language = string.lower, script = function (script_code) return (string.gsub(script_code, "^(%a)(%a%a%a)$", initial_caps_helper)) end, region = string.upper, variant = lower_or_map_lower, private_use = lower_or_map_lower, } setmetatable(parsed_subtags_mt, { __call = function (self, input) return setmetatable({ input = input }, self) end }) -- An array of patterns for each subtag, and a "type" field for the name -- of the subtag. -- The patterns are checked in order, and any of the subtags can be skipped. -- So, for example, the "language" subtag must precede the "script" -- subtag, but a tag may contain a "language" subtag, no "script" subtag -- and then a "region" subtag. -- If the full list of subtags has been iterated over, the remaining subtags -- must match the pattern for a private-use subtag, or the tag is invalid. local subtag_info = { -- can be put in data module { "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case -- include extlang? { "%a%a%a%a", type = "script" }, -- Ssss { "%a%a", "%d%d%d", type = "region" }, -- rr, DDD { "%d%d%d%d", -- 4 digits "%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters type = "variant", repeatable = true, -- There can be multiple variants. } } -- A previous draft, in [[Module:Lang/sandbox]]: -- https://en.wikipedia.org/w/index.php?oldid=812819217 -- Based on https://www.w3.org/International/articles/language-tags/. -- Parse a language tag. -- Returns nil if tag is not a string or empty. -- Else returns a table with a map of subtag type to subtag for all subtags that -- were parsed. -- If there was an error, returns an "error" field with a description of the -- error, and an "invalid" field with the suffix of the tag starting at the -- index where the error occurred. -- Does not recognize "extension" tags, such as those introduced by "u", as they -- are not needed on Wikipedia. Does not recognize "grandfathered" tags. -- Does not recognize extended language subtags, such as "zh-yue". -- https://www.rfc-editor.org/rfc/rfc6067.txt, https://tools.ietf.org/html/bcp47 -- Only checks that the syntax is correct, not that the values are valid. For -- instance, will accept non-existent language codes, like "zz". function p.parse_IETF(tag) ~~local~~if ~~subtags~~type(tag) ~= ~~mw.text.split(~~"string" or tag, == "-") then return nil end -- This ~~contains~~may contain the special fields "~~matched_count~~invalid" ~~and~~, "~~invalid~~error". -- ~~"matched_count" tracks the number of subtags,~~ "error" indicates why the -- tag is invalid (if applicable). -- All other fields are subtags, and they appear in the tag in the following -- order: -- "language", "script", "region", "variant", "private_use", "invalid" -- All these subtags can be strings or nil, while "variant" can also be an -- array of strings if more than one variant subtag was found. -- "invalid" is the portion of the tag after the last valid subtag (minus a -- hyphen). local segments = mw.text.split(tag, "-") ~~local parsed_subtags = { matched_count = 0 }~~ local parsed_subtags = parsed_subtags_mt(segments) -- Language tags probably only contain ASCII alphabetic and numerical ~~-- An array of patterns for each subtag, and a "type" field for the name~~ -- ofcharacters ~~the~~and ~~subtag~~hyphen-minus. if not tag:find "^[A-Za-z0-9-]+$" then ~~-- The patterns are checked in order, and any of the subtags can be skipped.~~ return parsed_subtags:throw( ~~-- So, for example, the "language" subtag must precede the "script"~~ "invalid_characters", ~~-- subtag, but a tag may contain a "language" subtag, no "script" subtag~~ fun.indexOf( ~~-- and then a "region" subtag.~~ function (tag) ~~-- If the full list of subtags has been iterated over, the remaining subtags~~ return tag:find "[^A-Za-z0-9-]" ~~-- must match the pattern for a private-use subtag, or the tag is invalid.~~ end, ~~local subtag_info = { -- can be put in data module~~ segments)) ~~{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case~~ end ~~-- include extlang?~~ ~~{ "%a%a%a%a", type = "script" }, -- Ssss~~ ~~{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD~~ { ~~"%d%d%d%d", -- 4 digits~~ ~~"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters~~ ~~type = "variant"~~ } } local subtag_i = 1 -- Index of current item in subtag_info. ~~local index = 1~~ local segment_i = 1 -- Index of current segment. ~~local last_matched_subtag_i = 0~~ while segments[segment_i] and subtag_info[subtag_i] do ~~for subtag_i, subtag in ipairs(subtags) do~~ local ~~type~~segment = segments[segment_i] local ~~matched = false~~subtag_type while not ~~matched~~subtag_type and subtag_info[subtag_i] do -- Check each pattern for the subtag type at "~~index~~subtag_i" in "subtag_info". ~~for~~local _,cur_subtag ~~pattern~~= ~~in ipairs(~~subtag_info[~~index~~subtag_i]~~) do~~ iffor ~~subtag:find("^" ..~~_, pattern ..in ~~"$"~~ipairs(cur_subtag) ~~then~~do if segment:find("^" .. pattern .. "$") then ~~type = subtag_info[index].type~~ ~~matched~~subtag_type = ~~true~~cur_subtag.type -- There can be multiple "variant" subtags (and "extension" -- subtags, if those are added). if not cur_subtag.repeatable then subtag_i = subtag_i + 1 end break end end ~~if not matched then -- Go to next item in subtag_info.~~ if not subtag_type then -- No match; try next subtag. ~~index = index + 1~~ subtag_i = subtag_i + 1 ~~if not subtag_info[index] then~~ ~~break~~ ~~end~~ end end -- If language subtag has not been found, or the current segment has not ~~if type then~~ -- been matched as a subtag, break the loop and check for ~~parsed_subtags[type] = subtag~~ -- a private-use subtag. ~~last_matched_subtag_i = subtag_i~~ if segment_i == 1 and subtag_type ~= "language" or not subtag_type then ~~parsed_subtags.matched_count = parsed_subtags.matched_count + 1~~ ~~elseif not subtag_info[index] then~~ break else if parsed_subtags[subtag_type] then -- Create an array. if type(parsed_subtags[subtag_type]) == "string" then parsed_subtags[subtag_type] = { parsed_subtags[subtag_type] } end -- else table table.insert(parsed_subtags[subtag_type], segment) else parsed_subtags[subtag_type] = segment end last_matched_segment_i = segment_i end segment_i = segment_i + 1 end if segments[segment_i] then -- More segments to scan? ~~if #subtags > parsed_subtags.matched_count then~~ -- Not all potential subtags were matched. ~~The~~Check ~~unmatched~~for ~~tail~~private-use ~~end of the tag~~subtags. -- https://tools.ietf.org/html/bcp47#section-2.2.7 ~~-- (after the subtag at the index last_matched_subtag_i) is a~~ -- ~~private~~Private-use ~~subtag~~subtags ifconsist itof ~~starts~~one ~~with~~or ~~"x".~~more ~~Otherwise,~~sequences ~~the~~of ~~tag~~1 isto 8 -- alphanumeric characters preceded by "x-". ~~-- invalid.~~ -- Alphanumericity has already been checked. ~~local suffix = table.concat(subtags, "-", last_matched_subtag_i + 1)~~ ~~if subtags[last_matched_subtag_i + 1] == "x" then~~ -- A tag must start with either a language subtag or a private-use subtag. ~~parsed_subtags.private_use = suffix~~ -- If next segment is not "x", introducing a private-use subtag, there ~~parsed_subtags.matched_count = parsed_subtags.matched_count + 1~~ -- is no private-use subtag. if segments[segment_i] and segments[segment_i]:lower() ~= "x" then if not parsed_subtags.language then return parsed_subtags:throw("no_language", 1) else return parsed_subtags:throw("invalid_subtag", segment_i) end elseif not segments[segment_i + 1] then return parsed_subtags:throw("empty_private_use", segment_i) end -- Check length of all segments after "x". for i = segment_i + 1, #segments do local length = #segments[i] if not (1 <= length and length <= 8) then return parsed_subtags :throw("invalid_private_use", segment_i) end end if not segments[last_matched_segment_i + 3] then -- There is only one private-use subtag. parsed_subtags.private_use = segments[segment_i + 1] else parsed_subtags.~~invalid~~private_use = ~~suffix~~{} for i = segment_i + 1, #segments do ~~parsed_subtags.error = "invalid subtag"~~ table.insert(parsed_subtags.private_use, segments[i]) end end end ~~if not~~return parsed_subtags~~.language then~~:remove_unnecessary_fields() end ~~parsed_subtags.error = "no language"~~ local lang_name_table = mw.loadData "Module:Language/name/data" local synonym_table = mw.loadData "Module:Lang/ISO 639 synonyms" local lang_data = mw.loadData "Module:Lang/data" function p.validate_lang_tag(parsed_subtags) -- Already checked that the tag starts with a language subtag or a private-use subtag. -- Script code is initially capitalized, region code is uppercase, -- everything else is lowercase. -- Check existence of language tag. if parsed_subtags.language and not (lang_data.override[parsed_subtags.language] or lang_name_table.lang[parsed_subtags.language]) then mw.log("Invalid language code", parsed_subtags.language, "in", parsed_subtags:get_tag()) end -- Check existence of script tag. ~~return parsed_subtags~~ if parsed_subtags.script then local lower_script = parsed_subtags.script:lower() if not lang_name_table.script[lower_script] then mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag()) end -- Check that script tag is not marked as superfluous (because the -- it is considered the default one for the language). if lang_name_table.suppressed[lower_script] and parsed_subtags.language and m_table.inArray( lang_name_table.suppressed[lower_script], parsed_subtags.language:lower()) then mw.log(parsed_subtags.script, "is suppressed with", parsed_subtags.language, "in", parsed_subtags:get_tag()) end end -- Check existence of region code.. if parsed_subtags.region and not lang_name_table.region[parsed_subtags.region:lower()] then mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag()) end -- Check that variant code is valid, and that it can validly be used with the -- given combination of language, script, region, and variant. -- Check for duplicate variant subtags? if parsed_subtags.variant then local lower_tag = parsed_subtags:get_tag():lower() for _, variant in ipairs(type(parsed_subtags.variant) == "table" and parsed_subtags.variant or { parsed_subtags.variant }) do if not lang_name_table.variant[variant] then mw.log("Invalid variant code", variant, "in", parsed_subtags:get_tag()) else local prefix = parsed_subtags:get_tag():lower():match("^(.-)%-" .. variant) -- Check that at least one of the prefixes is found at the -- beginning of lower_tag. if not fun.some(function (prefix) return lower_tag:find(prefix, 1, true) == 1 end, lang_name_table.variant[variant].prefixes) then mw.log("Variant tag", variant, "does not belong with prefix", prefix, "in", parsed_subtags:get_tag()) end end end end -- Check that the private-use subtag is actually used by Wikipedia. if parsed_subtags.private_use and not lang_data.override[parsed_subtags.tag] then mw.log("Invalid private-use subtag in", parsed_subtags:get_tag()) end end function p.show_COinS(frame) local ref = frame.args[1] local tag = ref:match('<span [^>]class="Z3988"[^>]>') local data = tag:match('title="(.-)"') local vals = {} for item in mw.text.gsplit(data, "&") do local key, value = item:match("(.-)=(.)") vals[key] = mw.uri.decode(value) end return ref .. "\n\n" .. table.concat( require "Module:fun".mapIter( function (value, key) return ("%s: %s"):format(key, value) end, m_table.sortedPairs( vals)), ", ") end