Module:Sandbox/Erutuon: Difference between revisions

Content deleted Content added
place variant tags in array if there is more than one
function to print data module for "default ignorable" property
 
(34 intermediate revisions by 2 users not shown)
Line 1:
local p = {}
 
function p.show(frame)
local Unicode_data = require 'Module:Unicode data/sandbox'
local page = frame.args[1] or "User:Erutuon/Unicode/DerivedCoreProperties.txt"
 
local text = assert(mw.title.new(page):getContent())
local function errorf(level, ...)
local defaultIgnorable = text
if type(level) == number then
:match("Derived Property: Default_Ignorable_Code_Point.-(%f[^\n]%x%x%x%x.-)%s*\n# Total code points")
return error(string.format(...), level + 1)
local singles, ranges = {}, {}
else -- level is actually the format string.
for codePoint1, codePoint2 in defaultIgnorable:gmatch("%f[^\n%z](%x+)%.?%.?(%x*)") do
return error(string.format(level, ...), 2)
codePoint1, codePoint2 = tonumber(codePoint1, 16), tonumber(codePoint2, 16)
end
local lastRange = ranges[#ranges]
end
if lastRange and lastRange[2] == codePoint1 - 1 then
 
lastRange[2] = codePoint2 or codePoint1
function mw.logf(...)
else
return mw.log(string.format(...))
if not codePoint2 then
end
singles[codePoint1] = true
 
else
local output_mt = {}
table.insert(ranges, { codePoint1, codePoint2 })
function output_mt:insert(str)
end
self.n = self.n + 1
end
self[self.n] = str
end
 
-- also in [[Module:Unicode data/documentation functions]]
-- Builds a string from the given format string and arguments with
-- string.format, then appends it to this buffer through self:insert.
function output_mt:insert_format(...)
	local formatted = string.format(...)
	self:insert(formatted)
end
 
output_mt.join = table.concat
 
output_mt.__index = output_mt
 
-- Creates a new, empty buffer: a table whose element counter "n" starts at 0
-- and whose metatable is output_mt.
local function Output()
	local buffer = { n = 0 }
	return setmetatable(buffer, output_mt)
end
 
 
-- A bracketed character set assembled from UTF-8 byte escapes. Each
-- "first-last" entry is a range; the code points that the escaped bytes
-- encode are noted beside each entry.
local Latn_pattern = table.concat {
'[',
'\n\32-\127', -- newline, then U+0020-U+007F
'\194\160-\194\172', -- U+00A0-U+00AC
'\195\128-\195\191', -- U+00C0-U+00FF
'\196\128-\197\191', -- U+0100-U+017F
'\198\128-\201\143', -- U+0180-U+024F
'\225\184\128-\225\187\191', -- U+1E00-U+1EFF
'\226\177\160-\226\177\191', -- U+2C60-U+2C7F
'\234\156\160-\234\159\191', -- U+A720-U+A7FF
'\234\172\176-\234\173\175', -- U+AB30-U+AB6F
'\239\172\128-\239\172\134', -- U+FB00-U+FB06
'\239\188\129-\239\188\188', -- U+FF01-U+FF3C
'–', -- en dash (U+2013)
'—', -- em dash (U+2014)
'«', '»', -- guillemets (U+00AB, U+00BB)
']',
};
 
local get_codepoint = mw.ustring.codepoint
-- Returns a string containing every character from the first character of
-- "start" to the first character of "ending", inclusive, in codepoint order.
-- Returns nil when "ending" sorts before "start".
local function expand_range(start, ending)
	local first, last = get_codepoint(start), get_codepoint(ending)
	if last < first then
		return nil
	end

	local expanded = {}
	for codepoint = first, last do
		expanded[#expanded + 1] = mw.ustring.char(codepoint)
	end
	return table.concat(expanded)
end
 
local fun = require "Module:Fun"
local m_table = require "Module:Table"
 
-- Metatable for counter tables, which is also its own metatable so that it
-- can be called as a constructor: script_to_count_mt() returns a new counter.
local script_to_count_mt = {}

-- A key that has not been seen yet is stored with a count of 0, so callers
-- can write t[key] = t[key] + 1 without initializing the entry first.
function script_to_count_mt.__index(self, key)
	self[key] = 0
	return 0
end

-- Calling a table that uses this metatable yields a fresh empty table whose
-- metatable is the table that was called.
function script_to_count_mt.__call(self, ...)
	return setmetatable({}, self)
end

setmetatable(script_to_count_mt, script_to_count_mt)
 
-- Takes a generic-for iterator triple (for example the values returned by
-- mw.ustring.gcodepoint) that yields one codepoint per step, looks up the
-- script of each codepoint, and returns the tallies as a comma-separated
-- list of "script (count)" entries. The comparator handed to
-- m_table.sortedPairs ranks scripts with higher counts first.
local function show_scripts(iterator, state, value)
	local tally = script_to_count_mt()
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		tally[script] = tally[script] + 1
	end

	local function by_descending_count(script1, script2)
		return tally[script1] > tally[script2]
	end

	local function describe(count, script)
		return ("%s (%d)"):format(script, count)
	end

	local descriptions = fun.mapIter(
		describe,
		m_table.sortedPairs(tally, by_descending_count))
	return table.concat(descriptions, ", ")
end
 
local function get_chars_in_scripts(iterator, state, value)
local script_to_char_set = {}
for codepoint in iterator, state, value do
local script = Unicode_data.lookup_script(codepoint)
script_to_char_set[script] = script_to_char_set[script] or {}
script_to_char_set[script][codepoint] = true
end
local template = [[
return script_to_char_set
local data = {}
end
 
data.defaultIgnorable = {
local function print_char_set_map(script_to_char_set, format, separator)
singles = {
format = format or "%s: %s"
...
separator = separator or "\n"
},
return table.concat(
fun.mapIter(
function (char_set, script)
local char_list = fun.mapIter(
function (_, codepoint)
return mw.ustring.char(codepoint)
end,
m_table.sortedPairs(char_set))
return (format):format(script, mw.text.nowiki(table.concat(char_list)))
end,
m_table.sortedPairs(script_to_char_set)),
separator)
end
 
function p.show(frame)
local expanded_pattern = Latn_pattern
:gsub('%[(.-)%]', '%1')
:gsub( -- Find two UTF-8-encoded characters separated by hyphen-minus.
'([%z\1-\127\194-\244][\128-\191]*)%-([%z\1-\127\194-\244][\128-\191]*)',
function (char1, char2)
return expand_range(char1, char2)
end)
ranges = {
return ('* <div style="overflow-wrap: break-word;">%s</div><br>%s')
...
:format(expanded_pattern
},
:gsub('^%s*', ''), -- Remove initial '\n ' to avoid creating unwanted pre element.
}
show_scripts(mw.ustring.gcodepoint(expanded_pattern)))
end
 
return data
local function get_block_info_from_arg(args, arg)
]]
local block_name = args[1]
or errorf("Parameter %s is required", tostring(arg))
local block_info = Unicode_data.get_block_info(block_name)
or errorf("The block '%s' could be found", block_name)
return block_info
end
 
local Array = require "Module:array"
local function get_boolean_from_arg(args, arg)
local printedRanges = Array()
return args[arg] and require "Module:Yesno" (args[arg])
for _, range in ipairs(ranges) do
end
local low, high, script_code = unpack(range)
 
printedRanges:insert(('\t\t{ 0x%05X, 0x%05X },'):format(low, high))
-- Template entry point. Argument 1 names a Unicode block; argument 2 is a
-- boolean-like flag. Returns the script summary for the block's codepoint
-- range, prefixed with "<block name>: " when the flag is set.
function p.scripts_in_block(frame)
	local args = frame.args
	local block_info = get_block_info_from_arg(args, 1)
	local show_block_name = get_boolean_from_arg(args, 2)
	local script_list = show_scripts(fun.range(block_info[1], block_info[2]))

	if not show_block_name then
		return script_list
	end
	return ("%s: %s"):format(block_info[3], script_list)
end
 
-- Wraps a block name in wikilink markup. A name containing a space is linked
-- as-is; any other name is piped to the "<name> (Unicode block)" title.
local function link_block_name(block_name)
	local has_space = block_name:find(" ", 1, true) ~= nil
	if has_space then
		return ("[[%s]]"):format(block_name)
	end
	return ("[[%s (Unicode block)|%s]]"):format(block_name, block_name)
end
 
function p.scripts_in_blocks(frame)
local output = Output()
local start = frame.args[1] and tonumber(frame.args[1], 16) or 0
local ending = frame.args[2] and tonumber(frame.args[2], 16) or 0x4000
local printedSingles = Array()
local script_data = mw.loadData "Module:Unicode data/scripts"
for codepoint in require 'Module:TableTools'.sortedPairs(singles) do
local singles = script_data.singles
printedSingles:insert(('\t\t[0x%05X] = true,'):format(codepoint))
local ranges = script_data.ranges
local function clear (self)
for _, key in ipairs(m_table.keysToList(self, false)) do
self[key] = nil
end
end
local countsdata = {}template
:gsub('%.%.%.', printedSingles:concat('\n'), 1)
setmetatable(counts, {
:gsub('%.%.%.', printedRanges:concat('\n'), 1)
__index = {
increment = function(self, script_code, amount)
self[script_code] = (self[script_code] or 0) + (amount or 1)
end,
clear = clear,
}
})
local codepoints_per_script = {}
setmetatable(codepoints_per_script, {
__index = {
add = function(self, script_code, codepoint)
self[script_code] = self[script_code] or { n = 0 }
if self[script_code].n <= 0x20
and not (codepoint <= 0x9F and (codepoint >= 0x80
or codepoint <= 0x1F)) then
if self[script_code].n == 0x20 then
local period = ('.'):byte()
for _ = 1, 3 do
self[script_code].n = self[script_code].n + 1
self[script_code][self[script_code].n] = period
end
else
if script_code == "Zinh" then -- probably combining character
self[script_code].n = self[script_code].n + 1
self[script_code][self[script_code].n] = 0x25CC
end
self[script_code].n = self[script_code].n + 1
self[script_code][self[script_code].n] = codepoint
end
end
end,
clear = clear,
}
})
output:insert [[
{| class="wikitable"
|+ Scripts in each Unicode block
! block !! codepoints !! scripts
]]
for _, block in pairs(mw.loadData "Module:Unicode data/blocks") do
local codepoint = block[1]
if codepoint > ending then break end
if codepoint >= start then
while codepoint <= block[2] do
local script = singles[codepoint]
local count
if script then -- Codepoint is in "singles" map.
counts:increment(script)
codepoints_per_script:add(script, codepoint)
codepoint = codepoint + 1
count = 1 -- for potential future use
else
local range, index = Unicode_data.binary_range_search(codepoint, ranges)
if range then -- Codepoint is in "ranges" array.
count = 0
script = range[3]
while codepoint <= range[2] and codepoint <= block[2] do
count = count + 1
codepoints_per_script:add(script, codepoint)
codepoint = codepoint + 1
end
counts:increment(script, count)
else -- Codepoint doesn't have data; it's Zzzz.
-- Get range immediately above codepoint.
while ranges[index][2] < codepoint do
index = index + 1
end
count = 0
script = "Zzzz"
local range = ranges[index]
while codepoint < range[1] and codepoint <= block[2]
and not singles[codepoint] do
count = count + 1
codepoint = codepoint + 1
end
counts:increment(script, count)
end
end
end
output:insert_format([[
|-
| %s
| U+%04X&ndash;U+%04X
| %s
]], link_block_name(block[3]), block[1], block[2],
table.concat(
fun.map(
function (count, script)
return ('<abbr title="%s">%s</abbr> (<span title="%s">%d</span>)')
:format(
script_data.aliases[script], script,
codepoints_per_script[script]
and mw.text.nowiki(mw.ustring.char(
unpack(codepoints_per_script[script])))
or "",
count)
end,
m_table.sortedPairs(
counts,
function (script1, script2)
return counts[script1] > counts[script2]
end)),
", "))
end
-- mw.logObject(codepoints_per_script, block[3])
counts:clear()
codepoints_per_script:clear()
end
output:insert "|}"
return output:join()data
end
 
local Unicode_data = require "Module:Unicode data/sandbox"
function p.chars_in_scripts_in_block(frame)
local fun = require "Module:fun"
local block_info = get_block_info_from_arg(frame.args, 1)
local m_table = require "Module:TableTools"
local show_block_name = get_boolean_from_arg(frame.args, 2)
 
local script_char_set_map = print_char_set_map(
local function errorf(level, ...)
get_chars_in_scripts(fun.range(block_info[1], block_info[2])))
if show_block_nametype(level) == "number" then
return error("%s: %s"):string.format(block_info[3]...), script_char_set_maplevel + 1)
else -- level is actually the format string.
else
return script_char_set_maperror(string.format(level, ...), 2)
end
end
Line 340 ⟶ 80:
local language_codes = {}
for lang_template in content:gmatch '"{{lang[^}]+'" do
local template_name = lang_template:match('"{{([^|}]+)'")
local language_code
if template_name == '"lang'" then
language_code = lang_template:match '"{{lang|([^|}]+)'"
elseif template_name:find '"^lang-'" then
language_code = lang_template:match '"{{lang-([^|}]+)'"
end
if language_code then
Line 353 ⟶ 93:
end
return table.concat(m_table.keysToList(language_codes), '", '")
end
 
local parsed_subtags_mt = {
__index = {
-- "error" is the error message.
-- "index" is the ordinal of the subtag in which the error was found.
throw = function (self, error, index)
self.error = self.error_messages[error]
self.invalid = table.concat(self.input, "-", index)
return self:remove_unnecessary_fields()
Line 367 ⟶ 109:
-- Only useful internally.
self.input = nil
self:pretty_print()
return setmetatable(self, nil)
p.validate_lang_tag(self)
return self
end,
-- Regularize capitalization of language subtags:
-- ZH-LATN -> zh-Latn, FR-ca -> fr-CA
pretty_print = function (self)
for key, func in pairs(self.print_funcs) do
if self[key] then
self[key] = func(self[key])
end
end
return self
end,
-- Re-create the original tag from the parsed subtags.
get_tag = function (self)
if self.tag then return self.tag end
local tag = {}
for _, subtag_name in ipairs(self.subtag_order) do
if subtag_name == "private_use" then
table.insert(tag, "x")
end
if type(self[subtag_name]) == "table" then
for _, subtag in ipairs(self[subtag_name]) do
table.insert(tag, subtag)
end
else
table.insert(tag, self[subtag_name])
end
end
tag = table.concat(tag, "-")
self.tag = tag -- Cache the result.
return tag
end,
subtag_order = {
"language", "script", "region", "variant", "private_use"
},
error_messages = {
invalid_characters = "invalid characters",
no_language = "no language subtag",
invalid_subtag = "invalid subtag",
invalid_private_use = "length of private-use subtag out of range",
empty_private_use = "empty private-use subtag",
}
}
}
-- Joins an uppercased "initial" with a lowercased "rest".
local function initial_caps_helper(initial, rest)
	local head = string.upper(initial)
	local tail = string.lower(rest)
	return head .. tail
end
-- Lowercases the argument. A table is passed to fun.map together with
-- string.lower; anything else goes straight to string.lower.
local function lower_or_map_lower(str)
	if type(str) ~= "table" then
		return string.lower(str)
	end
	return fun.map(string.lower, str)
end
-- Capitalization normalizers, keyed by the name of the subtag they apply to.
do
	local print_funcs = {}

	print_funcs.language = string.lower

	-- A four-letter code gets an uppercase first letter and lowercase rest;
	-- input that does not match the pattern is returned unchanged.
	function print_funcs.script(script_code)
		local capitalized = string.gsub(
			script_code, "^(%a)(%a%a%a)$", initial_caps_helper)
		return capitalized -- only the string, not gsub's match count
	end

	print_funcs.region = string.upper
	print_funcs.variant = lower_or_map_lower
	print_funcs.private_use = lower_or_map_lower

	parsed_subtags_mt.__index.print_funcs = print_funcs
end
 
Line 378 ⟶ 189:
})
-- An array of patterns for each subtag, and a "type" field for the name
-- of the subtag.
-- The patterns are checked in order, and any of the subtags can be skipped.
-- So, for example, the "language" subtag must precede the "script"
-- subtag, but a tag may contain a "language" subtag, no "script" subtag
-- and then a "region" subtag.
-- If the full list of subtags has been iterated over, the remaining subtags
-- must match the pattern for a private-use subtag, or the tag is invalid.
-- Each entry lists, in its array part, the Lua patterns that a segment may
-- match to count as that kind of subtag; "type" names the subtag.
-- NOTE(review): the patterns carry no anchors of their own; the parser
-- appears to wrap them in "^" and "$" before matching -- confirm.
local subtag_info = { -- can be put in data module
-- Two or three letters, or the digit 1 followed by one or more letters.
{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case
-- include extlang?
-- Exactly four letters.
{ "%a%a%a%a", type = "script" }, -- Ssss
-- Exactly two letters, or exactly three digits.
{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD
{
"%d%d%d%d", -- 4 digits
"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters
type = "variant",
repeatable = true, -- There can be multiple variants.
}
}
 
-- A previous draft, in [[Module:Lang/sandbox]]:
Line 394 ⟶ 225:
-- Does not recognize "extension" tags, such as those introduced by "u", as they
-- are not needed on Wikipedia. Does not recognize "grandfathered" tags.
-- Does not recognize extended language subtags, such as "zh-yue".
-- https://www.rfc-editor.org/rfc/rfc6067.txt, https://tools.ietf.org/html/bcp47
 
-- Only checks that the syntax is correct, not that the values are valid. For
-- instance, will accept non-existent language codes, like "zz".
function p.parse_IETF(tag)
if type(tag) ~= "string" or tag == "" then
Line 409 ⟶ 241:
-- order:
-- "language", "script", "region", "variant", "private_use", "invalid"
-- All these subtags can be strings or nil, while "variant" can also be an
-- array of strings if more than one variant subtag was found.
-- "invalid" is the portion of the tag after the last valid subtag (minus a
-- hyphen).
local potential_subtagssegments = mw.text.split(tag, "-")
local parsed_subtags = parsed_subtags_mt(potential_subtagssegments)
local matched_count = 0
-- Language tags probably only contain ASCII alphabetic and numerical
-- characters and hyphen-minus.
if not tag:find '"^[A-Za-z0-9-]+$'" then
return parsed_subtags:throw("invalid characters", 1)
"invalid_characters",
fun.indexOf(
function (tag)
return tag:find "[^A-Za-z0-9-]"
end,
segments))
end
local subtag_i = 1 -- Index of current item in subtag_info.
-- An array of patterns for each subtag, and a "type" field for the name
local segment_i = 1 -- Index of current segment.
-- of the subtag.
while segments[segment_i] and subtag_info[subtag_i] do
-- The patterns are checked in order, and any of the subtags can be skipped.
local segment = segments[segment_i]
-- So, for example, the "language" subtag must precede the "script"
local subtag_type
-- subtag, but a tag may contain a "language" subtag, no "script" subtag
while not subtag_type and subtag_info[subtag_i] do
-- and then a "region" subtag.
-- Check each pattern for the subtag type at "subtag_i" in "subtag_info".
-- If the full list of subtags has been iterated over, the remaining subtags
local cur_subtag = subtag_info[subtag_i]
-- must match the pattern for a private-use subtag, or the tag is invalid.
for _, pattern in ipairs(cur_subtag) do
local subtag_info = { -- can be put in data module
if segment:find("^" .. pattern .. "$") then
{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case
subtag_type = cur_subtag.type
-- include extlang?
{ "%a%a%a%a", type = "script" }, -- Ssss
{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD
{
"%d%d%d%d", -- 4 digits
"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters
type = "variant"
}
}
local index = 1
local last_matched_subtag_i = 0
for subtag_i, subtag in ipairs(potential_subtags) do
local type
local matched = false
while not matched do
-- Check each pattern for the subtag type at "index" in "subtag_info".
for _, pattern in ipairs(subtag_info[index]) do
if subtag:find("^" .. pattern .. "$") then
type = subtag_info[index].type
matched = true
-- There can be multiple "variant" subtags (and "extension"
-- subtags, if those are added).
if typenot ~= "variant"cur_subtag.repeatable then
indexsubtag_i = indexsubtag_i + 1
end
break
end
end
if not matched then -- Go to next item in subtag_info.
if not subtag_type then -- No match; try next subtag.
index = index + 1
subtag_i = subtag_i + 1
if not subtag_info[index] then
break
end
end
end
-- If language subtag has not been found, or the current segment has not
if type then
-- been matched as a subtag, break the loop and check for
if parsed_subtags[type] then
-- a private-use subtag.
parsed_subtags[type] = { parsed_subtags[type] }
if segment_i == 1 and subtag_type ~= "language" or not subtag_type then
table.insert(parsed_subtags[type], subtag)
break
else
if parsed_subtags[subtag_type] then -- Create an array.
if type(parsed_subtags[subtag_type]) == "string" then
parsed_subtags[subtag_type] = { parsed_subtags[subtag_type] }
end -- else table
table.insert(parsed_subtags[subtag_type], segment)
else
parsed_subtags[typesubtag_type] = subtagsegment
end
last_matched_segment_i = segment_i
last_matched_subtag_i = subtag_i
matched_count = matched_count + 1
elseif not subtag_info[index] then
break
end
segment_i = segment_i + 1
end
if segments[segment_i] then -- More segments to scan?
if #potential_subtags > matched_count then
-- Not all potential subtags were matched. Check for private-use subtags.
-- https://tools.ietf.org/html/bcp47#section-2.2.7
-- Private-use subtags consist of "x-" followed by one or more subtagssequences of 1 to 8
-- alphanumeric characters preceded by "x-".
-- consisting of 1 to 8 alphanumeric characters.
-- Alphanumericity has already been checked.
if potential_subtags[last_matched_subtag_i + 1] == "x" then
-- A tag must start with either a language subtag or a private-use subtag.
-- Check length of all following subtags.
-- If next segment is not "x", introducing a private-use subtag, there
if not potential_subtags[last_matched_subtag_i + 2] then
return parsed_subtags:throw("empty-- is no private-use subtag",.
if segments[segment_i] and segments[segment_i]:lower() ~= "x" then
last_matched_subtag_i + 1)
if not parsed_subtags.language then
return parsed_subtags:throw("no_language", 1)
else
return parsed_subtags:throw("invalid_subtag",
segment_i)
end
elseif not segments[segment_i + 1] then
return parsed_subtags:throw("empty_private_use",
segment_i)
end
-- Check length of all segments after "x".
for i = segment_i + 1, #segments do
local length = #segments[i]
if not (1 <= length and length <= 8) then
for i = last_matched_subtag_i + 2, #potential_subtags do
return parsed_subtags
local length = #potential_subtags[i]
:throw("invalid_private_use", segment_i)
end
end
if not segments[last_matched_segment_i + 3] then -- There is only one private-use subtag.
parsed_subtags.private_use = segments[segment_i + 1]
else
parsed_subtags.private_use = {}
for i = segment_i + 1, #segments do
table.insert(parsed_subtags.private_use, segments[i])
end
end
end
return parsed_subtags:remove_unnecessary_fields()
end
 
 
local lang_name_table = mw.loadData "Module:Language/name/data"
local synonym_table = mw.loadData "Module:Lang/ISO 639 synonyms"
local lang_data = mw.loadData "Module:Lang/data"
 
function p.validate_lang_tag(parsed_subtags)
-- Already checked that the tag starts with a language subtag or a private-use subtag.
-- Script code is initially capitalized, region code is uppercase,
-- everything else is lowercase.
-- Check existence of language tag.
if parsed_subtags.language and
not (lang_data.override[parsed_subtags.language]
or lang_name_table.lang[parsed_subtags.language]) then
mw.log("Invalid language code", parsed_subtags.language, "in", parsed_subtags:get_tag())
end
-- Check existence of script tag.
if parsed_subtags.script then
local lower_script = parsed_subtags.script:lower()
if not lang_name_table.script[lower_script] then
mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag())
end
-- Check that script tag is not marked as superfluous (because
-- it is considered the default one for the language).
if lang_name_table.suppressed[lower_script]
and parsed_subtags.language
and m_table.inArray(
lang_name_table.suppressed[lower_script],
parsed_subtags.language:lower()) then
mw.log(parsed_subtags.script, "is suppressed with",
parsed_subtags.language, "in", parsed_subtags:get_tag())
end
end
-- Check existence of region code.
if parsed_subtags.region and not lang_name_table.region[parsed_subtags.region:lower()] then
mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag())
end
-- Check that variant code is valid, and that it can validly be used with the
-- given combination of language, script, region, and variant.
-- Check for duplicate variant subtags?
if parsed_subtags.variant then
local lower_tag = parsed_subtags:get_tag():lower()
for _, variant in ipairs(type(parsed_subtags.variant) == "table"
and parsed_subtags.variant or { parsed_subtags.variant }) do
if not lang_name_table.variant[variant] then
mw.log("Invalid variant code", variant, "in", parsed_subtags:get_tag())
else
local prefix = parsed_subtags:get_tag():lower():match("^(.-)%-" .. variant)
-- Check that at least one of the prefixes is found at the
if not (1 <= length and length <= 8) then
-- beginning of lower_tag.
return parsed_subtags
if not fun.some(function (prefix)
:throw("length of private-use subtag out of range",
return lower_tag:find(prefix, 1, true) == 1
last_matched_subtag_i + 1)
end,
lang_name_table.variant[variant].prefixes) then
mw.log("Variant tag", variant, "does not belong with prefix",
prefix, "in", parsed_subtags:get_tag())
end
end
parsed_subtags.private_use = table.concat(potential_subtags, "-",
last_matched_subtag_i + 1)
else
return parsed_subtags:throw("invalid subtag",
last_matched_subtag_i + 1)
end
end
-- Check that the private-use subtag is actually used by Wikipedia.
if not (parsed_subtags.language or parsed_subtags.private_use) then
if parsed_subtags.private_use and not lang_data.override[parsed_subtags.tag] then
return parsed_subtags:throw("no language subtag", 1)
mw.log("Invalid private-use subtag in", parsed_subtags:get_tag())
end
end
 
function p.show_COinS(frame)
local ref = frame.args[1]
local tag = ref:match('<span [^>]*class="Z3988"[^>]*>')
return parsed_subtags:remove_unnecessary_fields()
local data = tag:match('title="(.-)"')
local vals = {}
for item in mw.text.gsplit(data, "&") do
local key, value = item:match("(.-)=(.*)")
vals[key] = mw.uri.decode(value)
end
return ref .. "\n\n" .. table.concat(
require "Module:fun".mapIter(
function (value, key)
return ("%s: %s"):format(key, value)
end,
m_table.sortedPairs(
vals)),
", ")
end