Module:Sandbox/Erutuon: Difference between revisions

Content deleted Content added
place invalid tag suffix in "invalid" field, use "error" to indicate both that an error was present and the type of error
function to print data module for "default ignorable" property
 
(45 intermediate revisions by 2 users not shown)
Line 1:
local p = {}
 
function p.show(frame)
local Unicode_data = require 'Module:Unicode data/sandbox'
local page = frame.args[1] or "User:Erutuon/Unicode/DerivedCoreProperties.txt"
 
local text = assert(mw.title.new(page):getContent())
local function errorf(level, ...)
local defaultIgnorable = text
if type(level) == number then
:match("Derived Property: Default_Ignorable_Code_Point.-(%f[^\n]%x%x%x%x.-)%s*\n# Total code points")
return error(string.format(...), level + 1)
local singles, ranges = {}, {}
else -- level is actually the format string.
for codePoint1, codePoint2 in defaultIgnorable:gmatch("%f[^\n%z](%x+)%.?%.?(%x*)") do
return error(string.format(level, ...), 2)
codePoint1, codePoint2 = tonumber(codePoint1, 16), tonumber(codePoint2, 16)
end
local lastRange = ranges[#ranges]
end
if lastRange and lastRange[2] == codePoint1 - 1 then
 
lastRange[2] = codePoint2 or codePoint1
function mw.logf(...)
else
return mw.log(string.format(...))
if not codePoint2 then
end
singles[codePoint1] = true
 
else
local output_mt = {}
table.insert(ranges, { codePoint1, codePoint2 })
function output_mt:insert(str)
end
self.n = self.n + 1
end
self[self.n] = str
end
 
-- also in [[Module:Unicode data/documentation functions]]
-- Formats the arguments with string.format and appends the resulting string
-- to this output buffer through self:insert.
function output_mt:insert_format(...)
	local formatted = string.format(...)
	self:insert(formatted)
end
 
-- output:join(sep) is table.concat applied to the buffer table itself.
output_mt.join = table.concat

-- Tables whose metatable is output_mt look up missing fields (the methods
-- defined on output_mt) in output_mt.
output_mt.__index = output_mt
 
-- Creates a new, empty output buffer: a table with its element counter "n"
-- set to 0 and output_mt as its metatable.
local function Output()
	local buffer = { n = 0 }
	return setmetatable(buffer, output_mt)
end
 
 
-- A bracketed character set written as raw UTF-8 byte sequences.
-- Each "a-b" piece is a pair of UTF-8-encoded characters separated by a
-- hyphen-minus; the code points below are obtained by decoding the bytes.
local Latn_pattern = '['
	.. '\n\32-\127'                  -- newline, then U+0020-U+007F
	.. '\194\160-\194\172'           -- U+00A0-U+00AC
	.. '\195\128-\195\191'           -- U+00C0-U+00FF
	.. '\196\128-\197\191'           -- U+0100-U+017F
	.. '\198\128-\201\143'           -- U+0180-U+024F
	.. '\225\184\128-\225\187\191'   -- U+1E00-U+1EFF
	.. '\226\177\160-\226\177\191'   -- U+2C60-U+2C7F
	.. '\234\156\160-\234\159\191'   -- U+A720-U+A7FF
	.. '\234\172\176-\234\173\175'   -- U+AB30-U+AB6F
	.. '\239\172\128-\239\172\134'   -- U+FB00-U+FB06
	.. '\239\188\129-\239\188\188'   -- U+FF01-U+FF3C
	.. '–'
	.. '—'
	.. '«' .. '»'
	.. ']'
 
local get_codepoint = mw.ustring.codepoint
-- Expands a range given by its two end characters into a string containing
-- every character from the code point of "start" up to and including the code
-- point of "ending".
-- Returns nil when the code point of "ending" is lower than that of "start".
local function expand_range(start, ending)
	local first, last = get_codepoint(start), get_codepoint(ending)
	if last < first then
		return nil
	end
	local pieces = {}
	for codepoint = first, last do
		-- Offset from the start of the range gives the 1-based array slot.
		pieces[codepoint - first + 1] = mw.ustring.char(codepoint)
	end
	return table.concat(pieces)
end
 
local fun = require "Module:Fun"
local m_table = require "Module:Table"
 
-- Metatable for counter tables, which is also its own metatable so that
-- calling script_to_count_mt() constructs a new counter table.
local script_to_count_mt = {}

-- Reading a missing key stores 0 under that key and yields 0.
function script_to_count_mt.__index(self, key)
	self[key] = 0
	return 0
end

-- Calling a table that has this metatable returns a fresh empty table whose
-- metatable is the table that was called.
function script_to_count_mt.__call(self, ...)
	return setmetatable({}, self)
end

setmetatable(script_to_count_mt, script_to_count_mt)
 
-- Takes a generic-for triplet (iterator, state, initial value), such as the
-- values returned by mw.ustring.gcodepoint, that yields one codepoint per step.
-- Looks up the script of every codepoint, tallies the codepoints per script and
-- returns the tallies as "script (count)" entries joined by ", ", with the
-- sort order driven by a comparison of counts (greater count first).
local function show_scripts(iterator, state, value)
	local tally = script_to_count_mt()
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		tally[script] = tally[script] + 1
	end

	local function higher_count_first(script1, script2)
		return tally[script1] > tally[script2]
	end

	local function format_entry(count, script)
		return ("%s (%d)"):format(script, count)
	end

	local entries = fun.mapIter(
		format_entry,
		m_table.sortedPairs(tally, higher_count_first))
	return table.concat(entries, ", ")
end
 
local function get_chars_in_scripts(iterator, state, value)
local script_to_char_set = {}
for codepoint in iterator, state, value do
local script = Unicode_data.lookup_script(codepoint)
script_to_char_set[script] = script_to_char_set[script] or {}
script_to_char_set[script][codepoint] = true
end
local template = [[
return script_to_char_set
local data = {}
end
 
data.defaultIgnorable = {
local function print_char_set_map(script_to_char_set, format, separator)
singles = {
format = format or "%s: %s"
...
separator = separator or "\n"
},
return table.concat(
fun.mapIter(
function (char_set, script)
local char_list = fun.mapIter(
function (_, codepoint)
return mw.ustring.char(codepoint)
end,
m_table.sortedPairs(char_set))
return (format):format(script, mw.text.nowiki(table.concat(char_list)))
end,
m_table.sortedPairs(script_to_char_set)),
separator)
end
 
function p.show(frame)
local expanded_pattern = Latn_pattern
:gsub('%[(.-)%]', '%1')
:gsub( -- Find two UTF-8-encoded characters separated by hyphen-minus.
'([%z\1-\127\194-\244][\128-\191]*)%-([%z\1-\127\194-\244][\128-\191]*)',
function (char1, char2)
return expand_range(char1, char2)
end)
ranges = {
return ('* <div style="overflow-wrap: break-word;">%s</div><br>%s')
...
:format(expanded_pattern
},
:gsub('^%s*', ''), -- Remove initial '\n ' to avoid creating unwanted pre element.
}
show_scripts(mw.ustring.gcodepoint(expanded_pattern)))
end
 
return data
local function get_block_info_from_arg(args, arg)
]]
local block_name = args[1]
or errorf("Parameter %s is required", tostring(arg))
local block_info = Unicode_data.get_block_info(block_name)
or errorf("The block '%s' could be found", block_name)
return block_info
end
 
local Array = require "Module:array"
local function get_boolean_from_arg(args, arg)
local printedRanges = Array()
return args[arg] and require "Module:Yesno" (args[arg])
for _, range in ipairs(ranges) do
end
local low, high, script_code = unpack(range)
 
printedRanges:insert(('\t\t{ 0x%05X, 0x%05X },'):format(low, high))
-- Template entry point.
-- Argument 1 identifies the block (resolved by get_block_info_from_arg);
-- argument 2 is a boolean flag (resolved by get_boolean_from_arg).
-- Returns the script summary for the codepoints from block_info[1] to
-- block_info[2]; when the flag is truthy the summary is prefixed with
-- block_info[3] and ": ".
function p.scripts_in_block(frame)
	local block_info = get_block_info_from_arg(frame.args, 1)
	local show_block_name = get_boolean_from_arg(frame.args, 2)
	local script_list = show_scripts(fun.range(block_info[1], block_info[2]))
	if not show_block_name then
		return script_list
	end
	return ("%s: %s"):format(block_info[3], script_list)
end
 
-- Wraps a block name in wikilink brackets.
-- A name containing a space links to the page of that name; a name without a
-- space links to "<name> (Unicode block)" and is piped to display the bare name.
local function link_block_name(block_name)
	local has_space = block_name:find(" ", 1, true) ~= nil
	if not has_space then
		return ("[[%s (Unicode block)|%s]]"):format(block_name, block_name)
	end
	return ("[[%s]]"):format(block_name)
end
 
function p.scripts_in_blocks(frame)
local output = Output()
local start = frame.args[1] and tonumber(frame.args[1], 16) or 0
local ending = frame.args[2] and tonumber(frame.args[2], 16) or 0x4000
local printedSingles = Array()
local script_data = mw.loadData "Module:Unicode data/scripts"
for codepoint in require 'Module:TableTools'.sortedPairs(singles) do
local singles = script_data.singles
printedSingles:insert(('\t\t[0x%05X] = true,'):format(codepoint))
local ranges = script_data.ranges
local function clear (self)
for _, key in ipairs(m_table.keysToList(self, false)) do
self[key] = nil
end
end
local countsdata = {}template
:gsub('%.%.%.', printedSingles:concat('\n'), 1)
setmetatable(counts, {
:gsub('%.%.%.', printedRanges:concat('\n'), 1)
__index = {
increment = function(self, script_code, amount)
self[script_code] = (self[script_code] or 0) + (amount or 1)
end,
clear = clear,
}
})
local codepoints_per_script = {}
setmetatable(codepoints_per_script, {
__index = {
add = function(self, script_code, codepoint)
self[script_code] = self[script_code] or { n = 0 }
if self[script_code].n <= 0x20
and not (codepoint <= 0x9F and (codepoint >= 0x80
or codepoint <= 0x1F)) then
if self[script_code].n == 0x20 then
local period = ('.'):byte()
for _ = 1, 3 do
self[script_code].n = self[script_code].n + 1
self[script_code][self[script_code].n] = period
end
else
if script_code == "Zinh" then -- probably combining character
self[script_code].n = self[script_code].n + 1
self[script_code][self[script_code].n] = 0x25CC
end
self[script_code].n = self[script_code].n + 1
self[script_code][self[script_code].n] = codepoint
end
end
end,
clear = clear,
}
})
return data
output:insert [[
{| class="wikitable"
|+ Scripts in each Unicode block
! block !! codepoints !! scripts
]]
for _, block in pairs(mw.loadData "Module:Unicode data/blocks") do
local codepoint = block[1]
if codepoint > ending then break end
if codepoint >= start then
while codepoint <= block[2] do
local script = singles[codepoint]
local count
if script then -- Codepoint is in "singles" map.
counts:increment(script)
codepoints_per_script:add(script, codepoint)
codepoint = codepoint + 1
count = 1 -- for potential future use
else
local range, index = Unicode_data.binary_range_search(codepoint, ranges)
if range then -- Codepoint is in "ranges" array.
count = 0
script = range[3]
while codepoint <= range[2] and codepoint <= block[2] do
count = count + 1
codepoints_per_script:add(script, codepoint)
codepoint = codepoint + 1
end
counts:increment(script, count)
else -- Codepoint doesn't have data; it's Zzzz.
-- Get range immediately above codepoint.
while ranges[index][2] < codepoint do
index = index + 1
end
count = 0
script = "Zzzz"
local range = ranges[index]
while codepoint < range[1] and codepoint <= block[2]
and not singles[codepoint] do
count = count + 1
codepoint = codepoint + 1
end
counts:increment(script, count)
end
end
end
output:insert_format([[
|-
| %s
| U+%04X&ndash;U+%04X
| %s
]], link_block_name(block[3]), block[1], block[2],
table.concat(
fun.map(
function (count, script)
return ('<abbr title="%s">%s</abbr> (<span title="%s">%d</span>)')
:format(
script_data.aliases[script], script,
codepoints_per_script[script]
and mw.text.nowiki(mw.ustring.char(
unpack(codepoints_per_script[script])))
or "",
count)
end,
m_table.sortedPairs(
counts,
function (script1, script2)
return counts[script1] > counts[script2]
end)),
", "))
end
-- mw.logObject(codepoints_per_script, block[3])
counts:clear()
codepoints_per_script:clear()
end
output:insert "|}"
return output:join()
end
 
local Unicode_data = require "Module:Unicode data/sandbox"
function p.chars_in_scripts_in_block(frame)
local fun = require "Module:fun"
local block_info = get_block_info_from_arg(frame.args, 1)
local m_table = require "Module:TableTools"
local show_block_name = get_boolean_from_arg(frame.args, 2)
 
local script_char_set_map = print_char_set_map(
local function errorf(level, ...)
get_chars_in_scripts(fun.range(block_info[1], block_info[2])))
if show_block_nametype(level) == "number" then
return error("%s: %s"):string.format(block_info[3]...), script_char_set_maplevel + 1)
else -- level is actually the format string.
else
return script_char_set_maperror(string.format(level, ...), 2)
end
end
Line 340 ⟶ 80:
local language_codes = {}
for lang_template in content:gmatch '"{{lang[^}]+'" do
local template_name = lang_template:match('"{{([^|}]+)'")
local language_code
if template_name == '"lang'" then
language_code = lang_template:match '"{{lang|([^|}]+)'"
elseif template_name:find '"^lang-'" then
language_code = lang_template:match '"{{lang-([^|}]+)'"
end
if language_code then
Line 353 ⟶ 93:
end
return table.concat(m_table.keysToList(language_codes), '", '")
end
 
-- Metatable supplying the methods and shared data for a table of parsed
-- language subtags. Instances carry an "input" field (array of the raw
-- hyphen-separated segments) while parsing is in progress, plus one field per
-- recognized subtag type.
-- A previous draft, in [[Module:Lang/sandbox]].
local parsed_subtags_mt = {
	__index = {
		-- Records a parsing error and finishes the table.
		-- "error_key" is a key of error_messages; the corresponding message
		-- is stored in the "error" field.
		-- "index" is the ordinal of the subtag in which the error was found;
		-- the input segments from that index onwards, joined by "-", are
		-- stored in the "invalid" field.
		throw = function (self, error_key, index)
			self.error = self.error_messages[error_key]
			self.invalid = table.concat(self.input, "-", index)
			return self:remove_unnecessary_fields()
		end,
		-- Drops the "input" field (only useful internally), regularizes
		-- capitalization, runs p.validate_lang_tag on the table and returns
		-- the table.
		remove_unnecessary_fields = function (self)
			self.input = nil
			self:pretty_print()
			p.validate_lang_tag(self)
			return self
		end,
		-- Regularize capitalization of language subtags:
		-- ZH-LATN -> zh-Latn, FR-ca -> fr-CA
		-- Applies each function in print_funcs to the field of the same name,
		-- if that field is present.
		pretty_print = function (self)
			for key, func in pairs(self.print_funcs) do
				if self[key] then
					self[key] = func(self[key])
				end
			end
			return self
		end,
		-- Re-create the tag from the parsed subtags, in the order given by
		-- subtag_order. The result is cached in the "tag" field.
		get_tag = function (self)
			if self.tag then return self.tag end
			local pieces = {}
			for _, subtag_name in ipairs(self.subtag_order) do
				local value = self[subtag_name]
				if value ~= nil then
					-- The "x" singleton introduces private-use subtags, so
					-- it must only appear when there are any.
					if subtag_name == "private_use" then
						pieces[#pieces + 1] = "x"
					end
					if type(value) == "table" then
						for _, subtag in ipairs(value) do
							pieces[#pieces + 1] = subtag
						end
					else
						pieces[#pieces + 1] = value
					end
				end
			end
			local tag = table.concat(pieces, "-")
			self.tag = tag -- Cache the result.
			return tag
		end,
		-- Order in which subtag types appear in a tag.
		subtag_order = {
			"language", "script", "region", "variant", "private_use"
		},
		-- Messages stored in the "error" field by throw, keyed by error name.
		error_messages = {
			invalid_characters = "invalid characters",
			no_language = "no language subtag",
			invalid_subtag = "invalid subtag",
			invalid_private_use = "length of private-use subtag out of range",
			empty_private_use = "empty private-use subtag",
		}
	}
}
-- Returns "initial" in uppercase followed by "rest" in lowercase.
local function initial_caps_helper(initial, rest)
	local head = string.upper(initial)
	local tail = string.lower(rest)
	return head .. tail
end
-- Lowercases "str". A table argument is passed to fun.map together with
-- string.lower; anything else is passed to string.lower directly.
local function lower_or_map_lower(str)
	if type(str) ~= "table" then
		return string.lower(str)
	end
	return fun.map(string.lower, str)
end
-- Functions that regularize the capitalization of each subtag type, keyed by
-- the name of the field they are applied to.
do
	local print_funcs = {}

	print_funcs.language = string.lower

	-- A four-letter script code gets an uppercase first letter and lowercase
	-- remaining letters; anything else is returned unchanged.
	function print_funcs.script(script_code)
		return (string.gsub(script_code, "^(%a)(%a%a%a)$", initial_caps_helper))
	end

	print_funcs.region = string.upper
	print_funcs.variant = lower_or_map_lower
	print_funcs.private_use = lower_or_map_lower

	parsed_subtags_mt.__index.print_funcs = print_funcs
end
 
-- Makes parsed_subtags_mt callable: parsed_subtags_mt(input) returns a new
-- table holding "input" in its "input" field, with parsed_subtags_mt as its
-- metatable.
setmetatable(parsed_subtags_mt, {
	__call = function (mt, input)
		local parsed = { input = input }
		return setmetatable(parsed, mt)
	end
})
-- An array of patterns for each subtag, and a "type" field for the name
-- of the subtag.
-- The patterns are checked in order, and any of the subtags can be skipped.
-- So, for example, the "language" subtag must precede the "script"
-- subtag, but a tag may contain a "language" subtag, no "script" subtag
-- and then a "region" subtag.
-- If the full list of subtags has been iterated over, the remaining subtags
-- must match the pattern for a private-use subtag, or the tag is invalid.
-- One entry per subtag type, in the order the types may appear in a tag.
-- The array part of an entry holds the Lua patterns a segment may match;
-- "type" names the subtag. Could be moved to a data module.
local subtag_info = {
	-- ll or lll; "1%a+" is a special case. Include extlang?
	{ type = "language", "%a%a%a?", "1%a+" },
	-- Ssss
	{ type = "script", "%a%a%a%a" },
	-- rr, DDD
	{ type = "region", "%a%a", "%d%d%d" },
	{
		type = "variant",
		repeatable = true, -- There can be multiple variants.
		"%d%d%d%d", -- 4 digits
		"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters
	},
}
 
-- A previous draft, in [[Module:Lang/sandbox]]:
-- https://en.wikipedia.org/w/index.php?oldid=812819217
 
-- Based on https://www.w3.org/International/articles/language-tags/.
 
-- Parse a language tag.
-- Returns nil if tag is not a string or empty.
-- Else returns a table with a map of subtag type to subtag for all subtags that
-- were parsed.
-- If there was an error, returns an "error" field with a description of the
-- error, and an "invalid" field with the suffix of the tag starting at the
-- index where the error occurred.
 
-- Does not recognize "extension" tags, such as those introduced by "u", as they
-- are not needed on Wikipedia. Does not recognize "grandfathered" tags.
-- Does not recognize extended language subtags, such as "zh-yue".
-- https://www.rfc-editor.org/rfc/rfc6067.txt, https://tools.ietf.org/html/bcp47
 
-- Only checks that the syntax is correct, not that the values are valid. For
-- instance, will accept non-existent language codes, like "zz".
function p.parse_IETF(tag)
localif subtagstype(tag) ~= mw.text.split("string" or tag, == "-") then
return nil
end
-- This containsmay contain the special fields "matched_countinvalid" and, "invaliderror".
-- "matched_count" tracks the number of subtags, "error" indicates why the
-- tag is invalid (if applicable).
-- All other fields are subtags, and they appear in the tag in the following
-- order:
-- "language", "script", "region", "variant", "private_use", "invalid"
-- All these subtags can be strings or nil, while "variant" can also be an
-- array of strings if more than one variant subtag was found.
-- "invalid" is the portion of the tag after the last valid subtag (minus a
-- hyphen).
local segments = mw.text.split(tag, "-")
local parsed_subtags = { matched_count = 0 }
local parsed_subtags = parsed_subtags_mt(segments)
-- Language tags probably only contain ASCII alphabetic and numerical
-- An array of patterns for each subtag, and a "type" field for the name
-- ofcharacters theand subtaghyphen-minus.
if not tag:find "^[A-Za-z0-9-]+$" then
-- The patterns are checked in order, and any of the subtags can be skipped.
return parsed_subtags:throw(
-- So, for example, the "language" subtag must precede the "script"
"invalid_characters",
-- subtag, but a tag may contain a "language" subtag, no "script" subtag
fun.indexOf(
-- and then a "region" subtag.
function (tag)
-- If the full list of subtags has been iterated over, the remaining subtags
return tag:find "[^A-Za-z0-9-]"
-- must match the pattern for a private-use subtag, or the tag is invalid.
end,
local subtag_info = { -- can be put in data module
segments))
{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case
end
-- include extlang?
{ "%a%a%a%a", type = "script" }, -- Ssss
{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD
{
"%d%d%d%d", -- 4 digits
"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters
type = "variant"
}
}
local subtag_i = 1 -- Index of current item in subtag_info.
local index = 1
local segment_i = 1 -- Index of current segment.
local last_matched_subtag_i = 0
while segments[segment_i] and subtag_info[subtag_i] do
for subtag_i, subtag in ipairs(subtags) do
local typesegment = segments[segment_i]
local matched = falsesubtag_type
while not matchedsubtag_type and subtag_info[subtag_i] do
-- Check each pattern for the subtag type at "indexsubtag_i" in "subtag_info".
forlocal _,cur_subtag pattern= in ipairs(subtag_info[indexsubtag_i]) do
iffor subtag:find("^" .._, pattern ..in "$"ipairs(cur_subtag) thendo
if segment:find("^" .. pattern .. "$") then
type = subtag_info[index].type
matchedsubtag_type = truecur_subtag.type
-- There can be multiple "variant" subtags (and "extension"
-- subtags, if those are added).
if not cur_subtag.repeatable then
subtag_i = subtag_i + 1
end
break
end
end
if not matched then -- Go to next item in subtag_info.
if not subtag_type then -- No match; try next subtag.
index = index + 1
subtag_i = subtag_i + 1
if not subtag_info[index] then
break
end
end
end
-- If language subtag has not been found, or the current segment has not
if type then
-- been matched as a subtag, break the loop and check for
parsed_subtags[type] = subtag
-- a private-use subtag.
last_matched_subtag_i = subtag_i
if segment_i == 1 and subtag_type ~= "language" or not subtag_type then
parsed_subtags.matched_count = parsed_subtags.matched_count + 1
elseif not subtag_info[index] then
break
else
if parsed_subtags[subtag_type] then -- Create an array.
if type(parsed_subtags[subtag_type]) == "string" then
parsed_subtags[subtag_type] = { parsed_subtags[subtag_type] }
end -- else table
table.insert(parsed_subtags[subtag_type], segment)
else
parsed_subtags[subtag_type] = segment
end
last_matched_segment_i = segment_i
end
segment_i = segment_i + 1
end
if segments[segment_i] then -- More segments to scan?
if #subtags > parsed_subtags.matched_count then
-- Not all potential subtags were matched. TheCheck unmatchedfor tailprivate-use end of the tagsubtags.
-- https://tools.ietf.org/html/bcp47#section-2.2.7
-- (after the subtag at the index last_matched_subtag_i) is a
-- privatePrivate-use subtagsubtags ifconsist itof startsone withor "x".more Otherwise,sequences theof tag1 isto 8
-- alphanumeric characters preceded by "x-".
-- invalid.
-- Alphanumericity has already been checked.
local suffix = table.concat(subtags, "-", last_matched_subtag_i + 1)
if subtags[last_matched_subtag_i + 1] == "x" then
-- A tag must start with either a language subtag or a private-use subtag.
parsed_subtags.private_use = suffix
-- If next segment is not "x", introducing a private-use subtag, there
parsed_subtags.matched_count = parsed_subtags.matched_count + 1
-- is no private-use subtag.
if segments[segment_i] and segments[segment_i]:lower() ~= "x" then
if not parsed_subtags.language then
return parsed_subtags:throw("no_language", 1)
else
return parsed_subtags:throw("invalid_subtag",
segment_i)
end
elseif not segments[segment_i + 1] then
return parsed_subtags:throw("empty_private_use",
segment_i)
end
-- Check length of all segments after "x".
for i = segment_i + 1, #segments do
local length = #segments[i]
if not (1 <= length and length <= 8) then
return parsed_subtags
:throw("invalid_private_use", segment_i)
end
end
if not segments[last_matched_segment_i + 3] then -- There is only one private-use subtag.
parsed_subtags.private_use = segments[segment_i + 1]
else
parsed_subtags.invalidprivate_use = suffix{}
for i = segment_i + 1, #segments do
parsed_subtags.error = "invalid subtag"
table.insert(parsed_subtags.private_use, segments[i])
end
end
end
if notreturn parsed_subtags.language then:remove_unnecessary_fields()
end
parsed_subtags.error = "no language"
 
 
local lang_name_table = mw.loadData "Module:Language/name/data"
local synonym_table = mw.loadData "Module:Lang/ISO 639 synonyms"
local lang_data = mw.loadData "Module:Lang/data"
 
-- Checks the subtags of a parsed language tag against the data tables and
-- writes a message to the debug log (mw.log) for each problem found.
-- "parsed_subtags" is a table with optional "language", "script", "region",
-- "variant" (string or array of strings) and "private_use" fields and a
-- get_tag method. Nothing is returned and the subtag fields are not changed;
-- get_tag may cache the tag in the "tag" field.
function p.validate_lang_tag(parsed_subtags)
	-- Already checked that the tag starts with a language subtag or a
	-- private-use subtag.
	-- Script code is initially capitalized, region code is uppercase,
	-- everything else is lowercase.

	-- Check existence of language tag.
	local language = parsed_subtags.language
	if language
			and not (lang_data.override[language]
				or lang_name_table.lang[language]) then
		mw.log("Invalid language code", language, "in", parsed_subtags:get_tag())
	end

	-- Check existence of script tag.
	local script = parsed_subtags.script
	if script then
		local lower_script = script:lower()
		if not lang_name_table.script[lower_script] then
			mw.log("Invalid script code", script, "in", parsed_subtags:get_tag())
		end
		-- Check that script tag is not marked as superfluous (because it is
		-- considered the default one for the language).
		local suppressed_for = lang_name_table.suppressed[lower_script]
		if suppressed_for
				and language
				and m_table.inArray(suppressed_for, language:lower()) then
			mw.log(script, "is suppressed with",
				language, "in", parsed_subtags:get_tag())
		end
	end

	-- Check existence of region code.
	local region = parsed_subtags.region
	if region and not lang_name_table.region[region:lower()] then
		mw.log("Invalid region code", region, "in", parsed_subtags:get_tag())
	end

	-- Check that variant code is valid, and that it can validly be used with
	-- the given combination of language, script, region, and variant.
	-- Check for duplicate variant subtags?
	local variants = parsed_subtags.variant
	if variants then
		if type(variants) ~= "table" then
			variants = { variants }
		end
		local lower_tag = parsed_subtags:get_tag():lower()
		for _, variant in ipairs(variants) do
			local variant_data = lang_name_table.variant[variant]
			if not variant_data then
				mw.log("Invalid variant code", variant, "in", parsed_subtags:get_tag())
			else
				-- Check that at least one of the prefixes is found at the
				-- beginning of lower_tag.
				local has_valid_prefix = fun.some(
					function (allowed_prefix)
						return lower_tag:find(allowed_prefix, 1, true) == 1
					end,
					variant_data.prefixes)
				if not has_valid_prefix then
					-- Portion of the tag preceding this variant, for the log.
					local prefix = lower_tag:match("^(.-)%-" .. variant)
					mw.log("Variant tag", variant, "does not belong with prefix",
						prefix, "in", parsed_subtags:get_tag())
				end
			end
		end
	end

	-- Check that the private-use subtag is actually used by Wikipedia.
	-- get_tag() is used rather than the "tag" cache field, which is unset
	-- until get_tag has been called at least once.
	-- NOTE(review): get_tag returns the tag with regularized capitalization;
	-- confirm that this matches the key format of lang_data.override.
	if parsed_subtags.private_use
			and not lang_data.override[parsed_subtags:get_tag()] then
		mw.log("Invalid private-use subtag in", parsed_subtags:get_tag())
	end
end
 
-- Template entry point.
-- Argument 1 is text containing a <span> tag with class="Z3988" whose title
-- attribute holds "&"-separated key=value pairs. Returns the original text,
-- a blank line, and then the decoded pairs as "key: value" entries in sorted
-- key order, joined by ", ".
-- Raises an error with a descriptive message if the argument, the span, or
-- its title attribute is missing. Items without "=" are ignored.
function p.show_COinS(frame)
	local ref = frame.args[1]
	if not ref then
		error("Parameter 1 is required")
	end
	local tag = ref:match('<span [^>]*class="Z3988"[^>]*>')
	if not tag then
		error('No span with class "Z3988" found in parameter 1')
	end
	local data = tag:match('title="(.-)"')
	if not data then
		error('The span with class "Z3988" has no title attribute')
	end

	local vals = {}
	for item in mw.text.gsplit(data, "&") do
		local key, value = item:match("(.-)=(.*)")
		-- An item without "=" yields no key; using nil as a table index
		-- would raise an error, so such items are skipped.
		if key then
			vals[key] = mw.uri.decode(value)
		end
	end

	local function format_pair(value, key)
		return ("%s: %s"):format(key, value)
	end

	return ref .. "\n\n" .. table.concat(
		require "Module:fun".mapIter(
			format_pair,
			m_table.sortedPairs(vals)),
		", ")
end