Module:Sandbox/Erutuon: Difference between revisions

Browse history interactively

← Previous edit

Content deleted Content added

Revision as of 23:47, 1 July 2018 edit Erutuon (talk \| contribs) Autopatrolled, Extended confirmed users 32,238 edits allow inputting of pagename; error catching ← Previous edit		Latest revision as of 09:39, 16 September 2019 edit undo Erutuon (talk \| contribs) Autopatrolled, Extended confirmed users 32,238 edits function to print data module for "default ignorable" property
(64 intermediate revisions by 2 users not shown)
Line 1: local p = {} function p.show(frame) ~~local Unicode_data = require 'Module:Unicode data'~~ local page = frame.args[1] or "User:Erutuon/Unicode/DerivedCoreProperties.txt" local text = assert(mw.title.new(page):getContent()) local defaultIgnorable = text :match("Derived Property: Default_Ignorable_Code_Point.-(%f[^\n]%x%x%x%x.-)%s\n# Total code points") local singles, ranges = {}, {} for codePoint1, codePoint2 in defaultIgnorable:gmatch("%f[^\n%z](%x+)%.?%.?(%x)") do codePoint1, codePoint2 = tonumber(codePoint1, 16), tonumber(codePoint2, 16) local lastRange = ranges[#ranges] if lastRange and lastRange[2] == codePoint1 - 1 then lastRange[2] = codePoint2 or codePoint1 else if not codePoint2 then singles[codePoint1] = true else table.insert(ranges, { codePoint1, codePoint2 }) end end end local template = [[ local data = {} data.defaultIgnorable = { singles = { ... }, ranges = { ... }, } return data ]] local Array = require "Module:array" local printedRanges = Array() for _, range in ipairs(ranges) do local low, high, script_code = unpack(range) printedRanges:insert(('\t\t{ 0x%05X, 0x%05X },'):format(low, high)) end local printedSingles = Array() for codepoint in require 'Module:TableTools'.sortedPairs(singles) do printedSingles:insert(('\t\t[0x%05X] = true,'):format(codepoint)) end local data = template :gsub('%.%.%.', printedSingles:concat('\n'), 1) :gsub('%.%.%.', printedRanges:concat('\n'), 1) return data end local Unicode_data = require "Module:Unicode data/sandbox" local fun = require "Module:fun" local m_table = require "Module:TableTools" local function errorf(level, ...) if type(level) == "number" then return error(string.format(...), level + 1) else -- level is actually the format string. Line 11 ⟶ 68: end function p.search_for_language_codes(frame) ~~local Latn_pattern = table.concat {~~ local page_name = frame.args[1] or "English language" ~~'[', -- this is a set so include opening bracket~~ ~~'\n\32-\127', -- C0 Controls and Basic Latin U+0020–U+007E (20 - 7E) + (U+0010 and U+007F <poem>...</poem> support)~~ local success, title_object = pcall(mw.title.new, page_name) ~~'\194\160-\194\172', -- C1 Controls and Latin-1 Supplement U+00A0-U+00AC (C2 A0 - C2 AC)~~ if not (success and title_object) then ~~'\195\128-\195\191', -- (skip shy) U+00C0–U+00FF (C3 80 - C3 BF)~~ mw.logf("Could not make title object for '%s'.", page_name) ~~'\196\128-\197\191', -- Latin Extended-A U+0100–U+017F (C4 80 - C5 BF)~~ return ~~'\198\128-\201\143', -- Latin Extended-B U+0180–U+024F (C6 80 - C9 8F)~~ ~~'\225\184\128-\225\187\191', -- Latin Extended Additional U+1E00-U+1EFF (E1 B8 80 - E1 BB BF)~~ ~~'\226\177\160-\226\177\191', -- Latin Extended-C U+2C60–U+2C7F (E2 B1 A0 - E2 B1 BF)~~ ~~'\234\156\160-\234\159\191', -- Latin Extended-D U+A720-U+A7FF (EA 9C A0 - EA 9F BF)~~ ~~'\234\172\176-\234\173\175', -- Latin Extended-E U+AB30-U+AB6F (EA AC B0 - EA AD AF)~~ ~~'\239\172\128-\239\172\134', -- Alphabetic Presentaion Forms U+FB00-U+FB06 (EF AC 80 - EF AC 86)~~ ~~'\239\188\129-\239\188\188', -- Halfwidth and Fullwidth Forms U+FF01-U+FF3C (EF BC 81 - EF BC BC)~~ ~~'–', -- ndash~~ ~~'—', -- mdash~~ ~~'«', '»', -- guillemets commonly used in several 'Latn' languages~~ ~~']', -- close the set~~ }; ~~local get_codepoint = mw.ustring.codepoint~~ ~~local function expand_range(start, ending)~~ ~~local lower, higher = get_codepoint(start), get_codepoint(ending)~~ ~~if higher < lower then~~ ~~return nil~~ end ~~local chars = {}~~ local icontent = 0title_object:getContent() ~~for codepoint = lower, higher do~~ local language_codes = {} ~~i = i + 1~~ for lang_template in content:gmatch "{{lang[^}]+" do ~~chars[i] = mw.ustring.char(codepoint)~~ local template_name = lang_template:match("{{([^\|}]+)") local language_code if template_name == "lang" then language_code = lang_template:match "{{lang\|([^\|}]+)" elseif template_name:find "^lang-" then language_code = lang_template:match "{{lang-([^\|}]+)" end if language_code then language_codes[language_code] = true end end ~~return table.concat(chars)~~ return table.concat(m_table.keysToList(language_codes), ", ") end local parsed_subtags_mt = { ~~local fun = require "Module:Fun"~~ __index = { ~~local m_table = require "Module:Table"~~ -- "error" is the error message. -- "index" is the ordinal of the subtag in which the error was found. ~~local script_to_count_mt = {~~ ~~__index~~ throw = function (self, ~~key~~error, index) self~~[key]~~.error = 0self.error_messages[error] self.invalid = table.concat(self.input, "-", index) ~~return 0~~ return self:remove_unnecessary_fields() ~~end,~~ end, ~~__call = function (self, ...)~~ ~~return setmetatable({}, self)~~ remove_unnecessary_fields = function (self) ~~end~~ -- Only useful internally. self.input = nil self:pretty_print() p.validate_lang_tag(self) return self end, -- Regularize capitalization of language subtags: -- ZH-LATN -> zh-Latn, FR-ca -> fr-CA pretty_print = function (self) for key, func in pairs(self.print_funcs) do if self[key] then self[key] = func(self[key]) end end return self end, -- Re-create the original tag from the parsed subtags. get_tag = function (self) if self.tag then return self.tag end local tag = {} for _, subtag_name in ipairs(self.subtag_order) do if subtag_name == "private_use" then table.insert(tag, "x") end if type(self[subtag_name]) == "table" then for _, subtag in ipairs(self[subtag_name]) do table.insert(tag, subtag) end else table.insert(tag, self[subtag_name]) end end tag = table.concat(tag, "-") self.tag = tag -- Cache the result. return tag end, subtag_order = { "language", "script", "region", "variant", "private_use" }, error_messages = { invalid_characters = "invalid characters", no_language = "no language subtag", invalid_subtag = "invalid subtag", invalid_private_use = "length of private-use subtag out of range", empty_private_use = "empty private-use subtag", } } } local function initial_caps_helper(initial, rest) ~~setmetatable(script_to_count_mt, script_to_count_mt)~~ return string.upper(initial) .. string.lower(rest) end ~~-- Uses an iterator (such as mw.ustring.gcodepoint) that generates a codepoint~~ local function lower_or_map_lower(str) ~~-- each time it is called with an optional state and another value.~~ if type(str) == "table" then ~~local function show_scripts(iterator, state, value)~~ return fun.map(string.lower, str) ~~local script_to_count = script_to_count_mt()~~ else ~~for codepoint in iterator, state, value do~~ return string.lower(str) ~~local script = Unicode_data.lookup_script(codepoint)~~ ~~script_to_count[script] = script_to_count[script] + 1~~ end ~~return table.concat(~~ ~~fun.mapIter(~~ ~~function (count, script)~~ ~~return ("%s (%d)"):format(script, count)~~ ~~end,~~ ~~m_table.sortedPairs(~~ ~~script_to_count,~~ ~~function (script1, script2)~~ ~~return script_to_count[script1] > script_to_count[script2]~~ ~~end)),~~ ~~", ")~~ end parsed_subtags_mt.__index.print_funcs = { language = string.lower, script = function (script_code) return (string.gsub(script_code, "^(%a)(%a%a%a)$", initial_caps_helper)) end, region = string.upper, variant = lower_or_map_lower, private_use = lower_or_map_lower, } setmetatable(parsed_subtags_mt, { ~~local function get_chars_in_scripts(iterator, state, value)~~ __call = function (self, input) ~~local script_to_char_set = {}~~ return setmetatable({ input = input }, self) ~~for codepoint in iterator, state, value do~~ ~~local script = Unicode_data.lookup_script(codepoint)~~ ~~script_to_char_set[script] = script_to_char_set[script] or {}~~ ~~script_to_char_set[script][codepoint] = true~~ end }) -- An array of patterns for each subtag, and a "type" field for the name ~~return script_to_char_set~~ -- of the subtag. ~~end~~ -- The patterns are checked in order, and any of the subtags can be skipped. -- So, for example, the "language" subtag must precede the "script" -- subtag, but a tag may contain a "language" subtag, no "script" subtag -- and then a "region" subtag. -- If the full list of subtags has been iterated over, the remaining subtags -- must match the pattern for a private-use subtag, or the tag is invalid. local subtag_info = { -- can be put in data module { "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case -- include extlang? { "%a%a%a%a", type = "script" }, -- Ssss { "%a%a", "%d%d%d", type = "region" }, -- rr, DDD { "%d%d%d%d", -- 4 digits "%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters type = "variant", repeatable = true, -- There can be multiple variants. } } -- A previous draft, in [[Module:Lang/sandbox]]: ~~local function print_char_set_map(script_to_char_set, format, separator)~~ -- https://en.wikipedia.org/w/index.php?oldid=812819217 ~~format = format or "%s: %s"~~ ~~separator = separator or "\n"~~ ~~return table.concat(~~ ~~fun.mapIter(~~ ~~function (char_set, script)~~ ~~local char_list = fun.mapIter(~~ ~~function (_, codepoint)~~ ~~return mw.ustring.char(codepoint)~~ ~~end,~~ ~~m_table.sortedPairs(char_set))~~ ~~return (format):format(script, mw.text.nowiki(table.concat(char_list)))~~ ~~end,~~ ~~m_table.sortedPairs(script_to_char_set)),~~ ~~separator)~~ ~~end~~ -- Based on https://www.w3.org/International/articles/language-tags/. ~~function p.show(frame)~~ ~~local expanded_pattern = Latn_pattern~~ -- Parse a language tag. ~~:gsub('%[(.-)%]', '%1')~~ -- Returns nil if tag is not a string or empty. ~~:gsub( -- Find two UTF-8-encoded characters separated by hyphen-minus.~~ -- Else returns a table with a map of subtag type to subtag for all subtags that ~~'([%z\1-\127\194-\244][\128-\191])%-([%z\1-\127\194-\244][\128-\191])',~~ -- were parsed. ~~function (char1, char2)~~ -- If there was an error, returns an "error" field with a description of the ~~return expand_range(char1, char2)~~ -- error, and an "invalid" field with the suffix of the tag starting at the ~~end)~~ -- index where the error occurred. -- Does not recognize "extension" tags, such as those introduced by "u", as they -- are not needed on Wikipedia. Does not recognize "grandfathered" tags. -- Does not recognize extended language subtags, such as "zh-yue". -- https://www.rfc-editor.org/rfc/rfc6067.txt, https://tools.ietf.org/html/bcp47 -- Only checks that the syntax is correct, not that the values are valid. For -- instance, will accept non-existent language codes, like "zz". function p.parse_IETF(tag) if type(tag) ~= "string" or tag == "" then return nil end -- This may contain the special fields "invalid", "error". ~~--[[~~ -- "error" indicates why the ~~mw.log(~~ -- tag is invalid (if applicable). ~~print_char_set_map(~~ -- All other fields are subtags, and they appear in the tag in the following ~~get_chars_in_scripts(~~ -- order: ~~expanded_pattern:gmatch "[%z\1-\127\194-\244][\128-\191]")))~~ -- "language", "script", "region", "variant", "private_use", "invalid" ~~--]]~~ -- All these subtags can be strings or nil, while "variant" can also be an -- array of strings if more than one variant subtag was found. -- "invalid" is the portion of the tag after the last valid subtag (minus a -- hyphen). local segments = mw.text.split(tag, "-") local parsed_subtags = parsed_subtags_mt(segments) -- Language tags probably only contain ASCII alphabetic and numerical ~~return (' <div style="overflow-wrap: break-word;">%s</div><br>%s')~~ -- characters and hyphen-minus. ~~:format(expanded_pattern~~ if not tag:find "^[A-Za-z0-9-]+$" then ~~:gsub('^%s', ''), -- Remove initial '\n ' to avoid creating unwanted pre element.~~ return parsed_subtags:throw( ~~show_scripts(mw.ustring.gcodepoint(expanded_pattern)))~~ "invalid_characters", ~~end~~ fun.indexOf( ~~local~~ function ~~get_block_info_from_arg~~(~~args, arg~~tag) return tag:find "[^A-Za-z0-9-]" ~~local block_name = args[1]~~ end, ~~or errorf("Parameter %s is required", tostring(arg))~~ segments)) end local subtag_i = 1 -- Index of current item in subtag_info. ~~local block_info = Unicode_data.get_block_info(block_name)~~ local segment_i = 1 -- Index of current segment. ~~or errorf("The block '%s' could be found", block_name)~~ while segments[segment_i] and subtag_info[subtag_i] do local segment = segments[segment_i] local subtag_type while not subtag_type and subtag_info[subtag_i] do -- Check each pattern for the subtag type at "subtag_i" in "subtag_info". local cur_subtag = subtag_info[subtag_i] for _, pattern in ipairs(cur_subtag) do if segment:find("^" .. pattern .. "$") then subtag_type = cur_subtag.type -- There can be multiple "variant" subtags (and "extension" -- subtags, if those are added). if not cur_subtag.repeatable then subtag_i = subtag_i + 1 end break end end if not subtag_type then -- No match; try next subtag. subtag_i = subtag_i + 1 end end -- If language subtag has not been found, or the current segment has not -- been matched as a subtag, break the loop and check for -- a private-use subtag. if segment_i == 1 and subtag_type ~= "language" or not subtag_type then break else if parsed_subtags[subtag_type] then -- Create an array. if type(parsed_subtags[subtag_type]) == "string" then parsed_subtags[subtag_type] = { parsed_subtags[subtag_type] } end -- else table table.insert(parsed_subtags[subtag_type], segment) else parsed_subtags[subtag_type] = segment end last_matched_segment_i = segment_i end segment_i = segment_i + 1 end if segments[segment_i] then -- More segments to scan? ~~return block_info~~ -- Not all potential subtags were matched. Check for private-use subtags. -- https://tools.ietf.org/html/bcp47#section-2.2.7 -- Private-use subtags consist of one or more sequences of 1 to 8 -- alphanumeric characters preceded by "x-". -- Alphanumericity has already been checked. -- A tag must start with either a language subtag or a private-use subtag. -- If next segment is not "x", introducing a private-use subtag, there -- is no private-use subtag. if segments[segment_i] and segments[segment_i]:lower() ~= "x" then if not parsed_subtags.language then return parsed_subtags:throw("no_language", 1) else return parsed_subtags:throw("invalid_subtag", segment_i) end elseif not segments[segment_i + 1] then return parsed_subtags:throw("empty_private_use", segment_i) end -- Check length of all segments after "x". for i = segment_i + 1, #segments do local length = #segments[i] if not (1 <= length and length <= 8) then return parsed_subtags :throw("invalid_private_use", segment_i) end end if not segments[last_matched_segment_i + 3] then -- There is only one private-use subtag. parsed_subtags.private_use = segments[segment_i + 1] else parsed_subtags.private_use = {} for i = segment_i + 1, #segments do table.insert(parsed_subtags.private_use, segments[i]) end end end return parsed_subtags:remove_unnecessary_fields() end ~~local function get_boolean_from_arg(args, arg)~~ ~~return args[arg] and require "Module:Yesno" (args[arg])~~ ~~end~~ local lang_name_table = mw.loadData "Module:Language/name/data" ~~function p.scripts_in_block(frame)~~ local synonym_table = mw.loadData "Module:Lang/ISO 639 synonyms" ~~local block_info = get_block_info_from_arg(frame.args, 1)~~ local lang_data = mw.loadData "Module:Lang/data" ~~local show_block_name = get_boolean_from_arg(frame.args, 2)~~ ~~local script_list = show_scripts(fun.range(block_info[1], block_info[2]))~~ ~~if show_block_name then~~ ~~return ("%s: %s"):format(block_info[3], script_list)~~ ~~else~~ ~~return script_list~~ ~~end~~ ~~end~~ function p.~~chars_in_scripts_in_block~~validate_lang_tag(~~frame~~parsed_subtags) -- Already checked that the tag starts with a language subtag or a private-use subtag. ~~local block_info = get_block_info_from_arg(frame.args, 1)~~ -- Script code is initially capitalized, region code is uppercase, ~~local show_block_name = get_boolean_from_arg(frame.args, 2)~~ -- everything else is lowercase. ~~local script_char_set_map = print_char_set_map(~~ ~~get_chars_in_scripts(fun.range(block_info[1], block_info[2])))~~ -- Check existence of language tag. ~~if show_block_name then~~ if parsed_subtags.language and ~~return ("%s: %s"):format(block_info[3], script_char_set_map)~~ not (lang_data.override[parsed_subtags.language] ~~else~~ or lang_name_table.lang[parsed_subtags.language]) then ~~return script_char_set_map~~ mw.log("Invalid language code", parsed_subtags.language, "in", parsed_subtags:get_tag()) end ~~end~~ ~~function p.search_for_language_codes(frame)~~ ~~local page_name = frame.args[1] or "English language"~~ -- Check existence of script tag. ~~local success, title_object = pcall(mw.title.new, page_name)~~ if ~~not (success and title_object)~~parsed_subtags.script then local lower_script = parsed_subtags.script:lower() ~~mw.log(("Could not make title object for '%s'"):format(page_name))~~ if not lang_name_table.script[lower_script] then ~~return~~ mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag()) end -- Check that script tag is not marked as superfluous (because the -- it is considered the default one for the language). if lang_name_table.suppressed[lower_script] and parsed_subtags.language and m_table.inArray( lang_name_table.suppressed[lower_script], parsed_subtags.language:lower()) then mw.log(parsed_subtags.script, "is suppressed with", parsed_subtags.language, "in", parsed_subtags:get_tag()) end end -- Check existence of region code.. ~~local content = title_object:getContent()~~ if parsed_subtags.region and not lang_name_table.region[parsed_subtags.region:lower()] then mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag()) end -- Check that variant code is valid, and that it can validly be used with the ~~local language_codes = {}~~ -- given combination of language, script, region, and variant. ~~for lang_template in content:gmatch '{{lang[^}]+' do~~ -- Check for duplicate variant subtags? ~~local template_name = lang_template:match('{{([^\|}]+)')~~ if parsed_subtags.variant then ~~local language_code~~ local lower_tag = parsed_subtags:get_tag():lower() ~~if template_name == 'lang' then~~ ~~language_code = lang_template:match '{{lang\|([^\|}]+)'~~ for _, variant in ipairs(type(parsed_subtags.variant) == "table" ~~elseif template_name:find '^lang-' then~~ and parsed_subtags.variant or { parsed_subtags.variant }) do ~~language_code = lang_template:match '{{lang-([^\|}]+)'~~ if not lang_name_table.variant[variant] then ~~end~~ mw.log("Invalid variant code", variant, "in", parsed_subtags:get_tag()) ~~if language_code then~~ else ~~language_codes[language_code] = true~~ local prefix = parsed_subtags:get_tag():lower():match("^(.-)%-" .. variant) -- Check that at least one of the prefixes is found at the -- beginning of lower_tag. if not fun.some(function (prefix) return lower_tag:find(prefix, 1, true) == 1 end, lang_name_table.variant[variant].prefixes) then mw.log("Variant tag", variant, "does not belong with prefix", prefix, "in", parsed_subtags:get_tag()) end end end end -- Check that the private-use subtag is actually used by Wikipedia. ~~return table.concat(m_table.keysToList(language_codes), ', ')~~ if parsed_subtags.private_use and not lang_data.override[parsed_subtags.tag] then mw.log("Invalid private-use subtag in", parsed_subtags:get_tag()) end end function p.show_COinS(frame) local ref = frame.args[1] local tag = ref:match('<span [^>]class="Z3988"[^>]>') local data = tag:match('title="(.-)"') local vals = {} for item in mw.text.gsplit(data, "&") do local key, value = item:match("(.-)=(.)") vals[key] = mw.uri.decode(value) end return ref .. "\n\n" .. table.concat( require "Module:fun".mapIter( function (value, key) return ("%s: %s"):format(key, value) end, m_table.sortedPairs( vals)), ", ") end