Module:Sandbox/Erutuon: Difference between revisions

Browse history interactively

← Previous edit

Content deleted Content added

Revision as of 21:41, 6 July 2018 edit Erutuon (talk \| contribs) Autopatrolled, Extended confirmed users 32,238 edits fix variant condition (I guess): search for valid prefix at beginning of string; fix suppressed script check ← Previous edit		Latest revision as of 09:39, 16 September 2019 edit undo Erutuon (talk \| contribs) Autopatrolled, Extended confirmed users 32,238 edits function to print data module for "default ignorable" property
(17 intermediate revisions by 2 users not shown)
Line 1: local p = {} function p.show(frame) ~~local Unicode_data = require "Module:Unicode data/sandbox"~~ local page = frame.args[1] or "User:Erutuon/Unicode/DerivedCoreProperties.txt" local text = assert(mw.title.new(page):getContent()) ~~local function errorf(level, ...)~~ local defaultIgnorable = text ~~if type(level) == number then~~ :match("Derived Property: Default_Ignorable_Code_Point.-(%f[^\n]%x%x%x%x.-)%s\n# Total code points") ~~return error(string.format(...), level + 1)~~ local singles, ranges = {}, {} ~~else -- level is actually the format string.~~ for codePoint1, codePoint2 in defaultIgnorable:gmatch("%f[^\n%z](%x+)%.?%.?(%x)") do ~~return error(string.format(level, ...), 2)~~ codePoint1, codePoint2 = tonumber(codePoint1, 16), tonumber(codePoint2, 16) ~~end~~ local lastRange = ranges[#ranges] ~~end~~ if lastRange and lastRange[2] == codePoint1 - 1 then lastRange[2] = codePoint2 or codePoint1 ~~function mw.logf(...)~~ else ~~return mw.log(string.format(...))~~ if not codePoint2 then ~~end~~ singles[codePoint1] = true else ~~local output_mt = {}~~ table.insert(ranges, { codePoint1, codePoint2 }) ~~function output_mt:insert(str)~~ end ~~self.n = self.n + 1~~ end ~~self[self.n] = str~~ ~~end~~ ~~-- also in [[Module:Unicode data/documentation functions]]~~ ~~function output_mt:insert_format(...)~~ ~~self:insert(string.format(...))~~ ~~end~~ ~~output_mt.join = table.concat~~ ~~output_mt.__index = output_mt~~ ~~local function Output()~~ ~~return setmetatable({ n = 0 }, output_mt)~~ ~~end~~ ~~local Latn_pattern = table.concat {~~ ~~"[",~~ ~~"\n\32-\127",~~ ~~"\194\160-\194\172",~~ ~~"\195\128-\195\191",~~ ~~"\196\128-\197\191",~~ ~~"\198\128-\201\143",~~ ~~"\225\184\128-\225\187\191",~~ ~~"\226\177\160-\226\177\191",~~ ~~"\234\156\160-\234\159\191",~~ ~~"\234\172\176-\234\173\175",~~ ~~"\239\172\128-\239\172\134",~~ ~~"\239\188\129-\239\188\188",~~ ~~"–",~~ ~~"—",~~ ~~"«", "»",~~ ~~"]",~~ }; ~~local get_codepoint = mw.ustring.codepoint~~ ~~local function expand_range(start, ending)~~ ~~local lower, higher = get_codepoint(start), get_codepoint(ending)~~ ~~if higher < lower then~~ ~~return nil~~ ~~end~~ ~~local chars = {}~~ ~~local i = 0~~ ~~for codepoint = lower, higher do~~ ~~i = i + 1~~ ~~chars[i] = mw.ustring.char(codepoint)~~ ~~end~~ ~~return table.concat(chars)~~ ~~end~~ ~~local fun = require "Module:Fun"~~ ~~local m_table = require "Module:Table"~~ ~~local script_to_count_mt = {~~ ~~__index = function (self, key)~~ ~~self[key] = 0~~ ~~return 0~~ ~~end,~~ ~~__call = function (self, ...)~~ ~~return setmetatable({}, self)~~ ~~end~~ } ~~setmetatable(script_to_count_mt, script_to_count_mt)~~ ~~-- Uses an iterator (such as mw.ustring.gcodepoint) that generates a codepoint~~ ~~-- each time it is called with an optional state and another value.~~ ~~local function show_scripts(iterator, state, value)~~ ~~local script_to_count = script_to_count_mt()~~ ~~for codepoint in iterator, state, value do~~ ~~local script = Unicode_data.lookup_script(codepoint)~~ ~~script_to_count[script] = script_to_count[script] + 1~~ ~~end~~ ~~return table.concat(~~ ~~fun.mapIter(~~ ~~function (count, script)~~ ~~return ("%s (%d)"):format(script, count)~~ ~~end,~~ ~~m_table.sortedPairs(~~ ~~script_to_count,~~ ~~function (script1, script2)~~ ~~return script_to_count[script1] > script_to_count[script2]~~ ~~end)),~~ ~~", ")~~ ~~end~~ ~~local function get_chars_in_scripts(iterator, state, value)~~ ~~local script_to_char_set = {}~~ ~~for codepoint in iterator, state, value do~~ ~~local script = Unicode_data.lookup_script(codepoint)~~ ~~script_to_char_set[script] = script_to_char_set[script] or {}~~ ~~script_to_char_set[script][codepoint] = true~~ end local template = [[ ~~return script_to_char_set~~ local data = {} ~~end~~ data.defaultIgnorable = { ~~local function print_char_set_map(script_to_char_set, format, separator)~~ singles = { ~~format = format or "%s: %s"~~ ... ~~separator = separator or "\n"~~ }, ~~return table.concat(~~ ~~fun.mapIter(~~ ~~function (char_set, script)~~ ~~local char_list = fun.mapIter(~~ ~~function (_, codepoint)~~ ~~return mw.ustring.char(codepoint)~~ ~~end,~~ ~~m_table.sortedPairs(char_set))~~ ~~return (format):format(script, mw.text.nowiki(table.concat(char_list)))~~ ~~end,~~ ~~m_table.sortedPairs(script_to_char_set)),~~ ~~separator)~~ ~~end~~ ~~function p.show(frame)~~ ~~local expanded_pattern = Latn_pattern~~ ~~:gsub("%[(.-)%]", "%1")~~ ~~:gsub( -- Find two UTF-8-encoded characters separated by hyphen-minus.~~ ~~"([%z\1-\127\194-\244][\128-\191])%-([%z\1-\127\194-\244][\128-\191])",~~ ~~function (char1, char2)~~ ~~return expand_range(char1, char2)~~ ~~end)~~ ranges = { ~~return ('* <div style="overflow-wrap: break-word;">%s</div><br>%s')~~ ... ~~:format(expanded_pattern~~ }, ~~:gsub("^%s", ""), -- Remove initial "\n " to avoid creating unwanted pre element.~~ } ~~show_scripts(mw.ustring.gcodepoint(expanded_pattern)))~~ ~~end~~ return data ~~local function get_block_info_from_arg(args, arg)~~ ]] ~~local block_name = args[1]~~ ~~or errorf("Parameter %s is required", tostring(arg))~~ ~~local block_info = Unicode_data.get_block_info(block_name)~~ ~~or errorf("The block '%s' could be found", block_name)~~ ~~return block_info~~ ~~end~~ local Array = require "Module:array" ~~local function get_boolean_from_arg(args, arg)~~ local printedRanges = Array() ~~return args[arg] and require "Module:Yesno" (args[arg])~~ for _, range in ipairs(ranges) do ~~end~~ local low, high, script_code = unpack(range) printedRanges:insert(('\t\t{ 0x%05X, 0x%05X },'):format(low, high)) ~~function p.scripts_in_block(frame)~~ ~~local block_info = get_block_info_from_arg(frame.args, 1)~~ ~~local show_block_name = get_boolean_from_arg(frame.args, 2)~~ ~~local script_list = show_scripts(fun.range(block_info[1], block_info[2]))~~ ~~if show_block_name then~~ ~~return ("%s: %s"):format(block_info[3], script_list)~~ ~~else~~ ~~return script_list~~ end ~~end~~ ~~local function link_block_name(block_name)~~ ~~if block_name:find " " then~~ ~~return ("[[%s]]"):format(block_name)~~ ~~else~~ ~~return ("[[%s (Unicode block)\|%s]]"):format(block_name, block_name)~~ ~~end~~ ~~end~~ ~~function p.scripts_in_blocks(frame)~~ ~~local output = Output()~~ ~~local start = frame.args[1] and tonumber(frame.args[1], 16) or 0~~ ~~local ending = frame.args[2] and tonumber(frame.args[2], 16) or 0x4000~~ local printedSingles = Array() ~~local script_data = mw.loadData "Module:Unicode data/scripts"~~ for codepoint in require 'Module:TableTools'.sortedPairs(singles) do ~~local singles = script_data.singles~~ printedSingles:insert(('\t\t[0x%05X] = true,'):format(codepoint)) ~~local ranges = script_data.ranges~~ ~~local function clear (self)~~ ~~for _, key in ipairs(m_table.keysToList(self, false)) do~~ ~~self[key] = nil~~ ~~end~~ end local ~~counts~~data = {}template :gsub('%.%.%.', printedSingles:concat('\n'), 1) ~~setmetatable(counts, {~~ :gsub('%.%.%.', printedRanges:concat('\n'), 1) ~~__index = {~~ ~~increment = function(self, script_code, amount)~~ ~~self[script_code] = (self[script_code] or 0) + (amount or 1)~~ ~~end,~~ ~~clear = clear,~~ } }) ~~local codepoints_per_script = {}~~ ~~setmetatable(codepoints_per_script, {~~ ~~__index = {~~ ~~add = function(self, script_code, codepoint)~~ ~~self[script_code] = self[script_code] or { n = 0 }~~ ~~if self[script_code].n <= 0x20~~ ~~and not (codepoint <= 0x9F and (codepoint >= 0x80~~ ~~or codepoint <= 0x1F)) then~~ ~~if self[script_code].n == 0x20 then~~ ~~local period = ("."):byte()~~ ~~for _ = 1, 3 do~~ ~~self[script_code].n = self[script_code].n + 1~~ ~~self[script_code][self[script_code].n] = period~~ ~~end~~ ~~else~~ ~~if script_code == "Zinh" then -- probably combining character~~ ~~self[script_code].n = self[script_code].n + 1~~ ~~self[script_code][self[script_code].n] = 0x25CC~~ ~~end~~ ~~self[script_code].n = self[script_code].n + 1~~ ~~self[script_code][self[script_code].n] = codepoint~~ ~~end~~ ~~end~~ ~~end,~~ ~~clear = clear,~~ } }) ~~output:insert [[~~ ~~{\| class="wikitable"~~ ~~\|+ Scripts in each Unicode block~~ ~~! block !! codepoints !! scripts~~ ]] ~~for _, block in pairs(mw.loadData "Module:Unicode data/blocks") do~~ ~~local codepoint = block[1]~~ ~~if codepoint > ending then break end~~ ~~if codepoint >= start then~~ ~~while codepoint <= block[2] do~~ ~~local script = singles[codepoint]~~ ~~local count~~ ~~if script then -- Codepoint is in "singles" map.~~ ~~counts:increment(script)~~ ~~codepoints_per_script:add(script, codepoint)~~ ~~codepoint = codepoint + 1~~ ~~count = 1 -- for potential future use~~ ~~else~~ ~~local range, index = Unicode_data.binary_range_search(codepoint, ranges)~~ ~~if range then -- Codepoint is in "ranges" array.~~ ~~count = 0~~ ~~script = range[3]~~ ~~while codepoint <= range[2] and codepoint <= block[2] do~~ ~~count = count + 1~~ ~~codepoints_per_script:add(script, codepoint)~~ ~~codepoint = codepoint + 1~~ ~~end~~ ~~counts:increment(script, count)~~ ~~else -- Codepoint doesn't have data; it's Zzzz.~~ ~~-- Get range immediately above codepoint.~~ ~~while ranges[index][2] < codepoint do~~ ~~index = index + 1~~ ~~end~~ ~~count = 0~~ ~~script = "Zzzz"~~ ~~local range = ranges[index]~~ ~~while codepoint < range[1] and codepoint <= block[2]~~ ~~and not singles[codepoint] do~~ ~~count = count + 1~~ ~~codepoint = codepoint + 1~~ ~~end~~ ~~counts:increment(script, count)~~ ~~end~~ ~~end~~ ~~end~~ ~~output:insert_format([[~~ \|- ~~\| %s~~ ~~\| U+%04X–U+%04X~~ ~~\| %s~~ ~~]], link_block_name(block[3]), block[1], block[2],~~ ~~table.concat(~~ ~~fun.map(~~ ~~function (count, script)~~ ~~return ('<abbr title="%s">%s</abbr> (<span title="%s">%d</span>)')~~ ~~:format(~~ ~~script_data.aliases[script], script,~~ ~~codepoints_per_script[script]~~ ~~and mw.text.nowiki(mw.ustring.char(~~ ~~unpack(codepoints_per_script[script])))~~ ~~or "",~~ ~~count)~~ ~~end,~~ ~~m_table.sortedPairs(~~ ~~counts,~~ ~~function (script1, script2)~~ ~~return counts[script1] > counts[script2]~~ ~~end)),~~ ~~", "))~~ ~~end~~ ~~-- mw.logObject(codepoints_per_script, block[3])~~ ~~counts:clear()~~ ~~codepoints_per_script:clear()~~ ~~end~~ ~~output:insert "\|}"~~ return ~~output:join()~~data end local Unicode_data = require "Module:Unicode data/sandbox" ~~function p.chars_in_scripts_in_block(frame)~~ local fun = require "Module:fun" ~~local block_info = get_block_info_from_arg(frame.args, 1)~~ local m_table = require "Module:TableTools" ~~local show_block_name = get_boolean_from_arg(frame.args, 2)~~ ~~local script_char_set_map = print_char_set_map(~~ local function errorf(level, ...) ~~get_chars_in_scripts(fun.range(block_info[1], block_info[2])))~~ if ~~show_block_name~~type(level) == "number" then return error(~~"%s: %s"):~~string.format(~~block_info[3]~~...), ~~script_char_set_map~~level + 1) else -- level is actually the format string. ~~else~~ return ~~script_char_set_map~~error(string.format(level, ...), 2) end end Line 374 ⟶ 114: end, -- Regularize capitalization of language subtags: -- ZH-LATN -> zh-Latn, FR-ca -> fr-CA pretty_print = function (self) for key, func in pairs(self.print_funcs) do Line 383 ⟶ 125: end, -- Re-create the original tag from the parsed subtags. get_tag = function (self) if self.tag then return self.tag end Line 388 ⟶ 131: local tag = {} for _, subtag_name in ipairs(self.subtag_order) do ~~table.insert(tag,~~if ~~self[~~subtag_name]) == "private_use" then table.insert(tag, "x") end if type(self[subtag_name]) == "table" then for _, subtag in ipairs(self[subtag_name]) do table.insert(tag, subtag) end else table.insert(tag, self[subtag_name]) end end tag = table.concat(tag, "-") self.tag = tag -- Cache the result. return tag end, Line 434 ⟶ 189: }) -- An array of patterns for each subtag, and a "type" field for the name -- of the subtag. -- The patterns are checked in order, and any of the subtags can be skipped. -- So, for example, the "language" subtag must precede the "script" -- subtag, but a tag may contain a "language" subtag, no "script" subtag -- and then a "region" subtag. -- If the full list of subtags has been iterated over, the remaining subtags -- must match the pattern for a private-use subtag, or the tag is invalid. local subtag_info = { -- can be put in data module { "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case -- include extlang? { "%a%a%a%a", type = "script" }, -- Ssss { "%a%a", "%d%d%d", type = "region" }, -- rr, DDD { "%d%d%d%d", -- 4 digits "%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters type = "variant", repeatable = true, -- There can be multiple variants. } } -- A previous draft, in [[Module:Lang/sandbox]]: Line 470 ⟶ 245: -- "invalid" is the portion of the tag after the last valid subtag (minus a -- hyphen). local ~~potential_subtags~~segments = mw.text.split(tag, "-") local parsed_subtags = parsed_subtags_mt(~~potential_subtags~~segments) ~~parsed_subtags.tag = tag~~ -- Language tags probably only contain ASCII alphabetic and numerical Line 483 ⟶ 257: return tag:find "[^A-Za-z0-9-]" end, ~~potential_subtags~~segments)) end local subtag_i = 1 -- Index of current item in subtag_info. ~~-- An array of patterns for each subtag, and a "type" field for the name~~ local segment_i = 1 -- Index of current segment. ~~-- of the subtag.~~ while segments[segment_i] and subtag_info[subtag_i] do ~~-- The patterns are checked in order, and any of the subtags can be skipped.~~ local segment = segments[segment_i] ~~-- So, for example, the "language" subtag must precede the "script"~~ ~~-- subtag, but a tag may contain a "language" subtag, no "script" subtag~~ ~~-- and then a "region" subtag.~~ ~~-- If the full list of subtags has been iterated over, the remaining subtags~~ ~~-- must match the pattern for a private-use subtag, or the tag is invalid.~~ ~~local subtag_info = { -- can be put in data module~~ ~~{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case~~ ~~-- include extlang?~~ ~~{ "%a%a%a%a", type = "script" }, -- Ssss~~ ~~{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD~~ { ~~"%d%d%d%d", -- 4 digits~~ ~~"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters~~ ~~type = "variant"~~ } } ~~local index = 1~~ ~~local last_matched_subtag_i = 0~~ ~~for subtag_i, subtag in ipairs(potential_subtags) do~~ local subtag_type while not subtag_type and subtag_info[subtag_i] do ~~local matched = false~~ -- Check each pattern for the subtag type at "subtag_i" in "subtag_info". ~~while not matched do~~ local cur_subtag = subtag_info[subtag_i] ~~-- Check each pattern for the subtag type at "index" in "subtag_info".~~ for _, pattern in ipairs(~~subtag_info[index]~~cur_subtag) do if ~~subtag~~segment:find("^" .. pattern .. "$") then subtag_type = ~~subtag_info[index]~~cur_subtag.type ~~matched = true~~ -- There can be multiple "variant" subtags (and "extension" -- subtags, if those are added). if ~~subtag_type~~not ~~~= "variant"~~cur_subtag.repeatable then ~~index~~subtag_i = ~~index~~subtag_i + 1 end break end end ~~if not matched then -- Go to next item in subtag_info.~~ if not subtag_type then -- No match; try next subtag. ~~index = index + 1~~ subtag_i = subtag_i + 1 ~~if not subtag_info[index] then~~ ~~break~~ ~~end~~ end end -- If language subtag has not been found, or the current segment has not ~~if subtag_i == 1 and subtag_type ~= "language" then~~ -- been matched as a subtag, break the loop and check for -- a private-use subtag. if segment_i == 1 and subtag_type ~= "language" or not subtag_type then break else ~~elseif subtag_type then~~ if parsed_subtags[subtag_type] then -- Create an array. if type(parsed_subtags[subtag_type]) == "string" then parsed_subtags[subtag_type] = { parsed_subtags[subtag_type] } end -- else table table.insert(parsed_subtags[subtag_type], ~~subtag~~segment) else parsed_subtags[subtag_type] = ~~subtag~~segment end last_matched_segment_i = segment_i ~~last_matched_subtag_i = subtag_i~~ ~~elseif not subtag_info[index] then~~ ~~break~~ end segment_i = segment_i + 1 end if segments[segment_i] then -- More segments to scan? ~~if last_matched_subtag_i < #potential_subtags then~~ -- Not all potential subtags were matched. Check for private-use subtags. -- https://tools.ietf.org/html/bcp47#section-2.2.7 Line 557 ⟶ 311: -- alphanumeric characters preceded by "x-". -- Alphanumericity has already been checked. ~~if potential_subtags[last_matched_subtag_i + 1]~~ -- A tag must start with either a language subtag or a private-use subtag. ~~and potential_subtags[last_matched_subtag_i + 1]:lower() ~= "x" then~~ -- If next segment is not "x", introducing a private-use subtag, there -- is no private-use subtag. if segments[segment_i] and segments[segment_i]:lower() ~= "x" then if not parsed_subtags.language then return parsed_subtags:throw("no_language", 1) else return parsed_subtags:throw("invalid_subtag", segment_i) ~~last_matched_subtag_i + 1)~~ end elseif not segments[segment_i + 1] then ~~end~~ ~~-- Check length of all following subtags.~~ ~~if not potential_subtags[last_matched_subtag_i + 2] then~~ return parsed_subtags:throw("empty_private_use", segment_i) ~~last_matched_subtag_i + 1)~~ end -- Check length of all segments after "x". ~~for i = last_matched_subtag_i + 2, #potential_subtags do~~ for i = segment_i + 1, #segments do ~~local length = #potential_subtags[i]~~ local length = #segments[i] if not (1 <= length and length <= 8) then return parsed_subtags :throw("invalid_private_use", segment_i) ~~last_matched_subtag_i + 1)~~ end end if not ~~potential_subtags~~segments[~~last_matched_subtag_i~~last_matched_segment_i + 3] then -- There is only one private-use subtag. parsed_subtags.private_use = ~~potential_subtags~~segments[~~last_matched_subtag_i~~segment_i + 21] else parsed_subtags.private_use = {} for i = ~~last_matched_subtag_i~~segment_i + 21, #~~potential_subtags~~segments do table.insert(parsed_subtags.private_use, ~~potential_subtags~~segments[i]) end end Line 606 ⟶ 360: -- everything else is lowercase. -- Check existence of language tag. if parsed_subtags.language and not (lang_data.override[parsed_subtags.language] Line 612 ⟶ 367: end -- Check existence of script tag. if parsed_subtags.script then iflocal ~~not~~lower_script = ~~lang_name_table.script[~~parsed_subtags.script:lower()~~] then~~ if not lang_name_table.script[lower_script] then mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag()) end -- Check that script tag is not marked as superfluous (because the ~~local lower_script = parsed_subtags.script:lower()~~ -- it is considered the default one for the language). if lang_name_table.suppressed[lower_script] and parsed_subtags.language and m_table.inArray( lang_name_table.suppressed[lower_script], parsed_subtags.language:lower()) then mw.log(parsed_subtags.script, "is suppressed with", parsed_subtags.language, "in", parsed_subtags:get_tag()) end end -- Check existence of region code.. if parsed_subtags.region and not lang_name_table.region[parsed_subtags.region:lower()] then mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag()) end -- Check that variant code is valid, and that it can validly be used with the -- given combination of language, script, region, and variant. -- Check for duplicate variant subtags? if parsed_subtags.variant then local lower_tag = parsed_subtags:get_tag():lower() Line 638 ⟶ 402: mw.log("Invalid variant code", variant, "in", parsed_subtags:get_tag()) else local prefix = parsed_subtags:get_tag():lower():match("^(.-)%-" .. variant) -- Check that at least one of the prefixes is found at the -- beginning of lower_tag. if not fun.some(function (prefix) return lower_tag:find(prefix, 1, true) == 1 ~~end,~~ end, lang_name_table.variant[variant].prefixes) then mw.log("Variant tag", variant, "does not belong with prefix", prefix, "in", parsed_subtags:get_tag()) end end Line 650 ⟶ 417: end -- Check that the private-use subtag is actually used by Wikipedia. if parsed_subtags.private_use and not lang_data.override[parsed_subtags.tag] then mw.log("Invalid private-use subtag", ~~parsed_subtags.private_use, "~~in", parsed_subtags:get_tag()) end end function p.show_COinS(frame) local ref = frame.args[1] local tag = ref:match('<span [^>]class="Z3988"[^>]>') local data = tag:match('title="(.-)"') local vals = {} for item in mw.text.gsplit(data, "&") do local key, value = item:match("(.-)=(.)") vals[key] = mw.uri.decode(value) end return ref .. "\n\n" .. table.concat( require "Module:fun".mapIter( function (value, key) return ("%s: %s"):format(key, value) end, m_table.sortedPairs( vals)), ", ") end