Module:Sandbox/Erutuon: Difference between revisions

Content deleted Content added
allow inputting of pagename; error catching
function to print data module for "default ignorable" property
 
(64 intermediate revisions by 2 users not shown)
Line 1:
local p = {}
 
--- Generate the source text of a data module listing the code points that have
-- the "Default_Ignorable_Code_Point" derived property.
--
-- Reads the wikitext of a page (expected to hold the contents of
-- DerivedCoreProperties.txt), extracts the section for the property, and fills
-- the "singles" and "ranges" placeholders of a module template with the code
-- points found there.
--
-- @param frame  Frame object. frame.args[1] is the title of the page to read;
--               when it is missing or empty,
--               "User:Erutuon/Unicode/DerivedCoreProperties.txt" is used.
-- @return string  Lua source for the data module.
-- Raises an error when the title is invalid, when the page has no content, or
-- when the property section cannot be found in the page.
function p.show(frame)
	local page = frame.args[1]
	-- Treat an empty argument like a missing one so that the default applies.
	if not page or page == "" then
		page = "User:Erutuon/Unicode/DerivedCoreProperties.txt"
	end

	-- mw.title.new returns nil for an invalid title; check before indexing.
	local title = mw.title.new(page)
	if not title then
		error(("'%s' is not a valid page title"):format(page))
	end
	local text = title:getContent()
	if not text then
		error(("The page '%s' does not exist or has no content"):format(page))
	end

	-- Capture from the first line that starts with four hex digits up to the
	-- "# Total code points" line that closes the property's section.
	local defaultIgnorable = text
		:match("Derived Property: Default_Ignorable_Code_Point.-(%f[^\n]%x%x%x%x.-)%s*\n# Total code points")
	if not defaultIgnorable then
		error(("No Default_Ignorable_Code_Point section was found in '%s'"):format(page))
	end

	-- Each data line starts with "XXXX" or "XXXX..YYYY" (hexadecimal).
	-- NOTE(review): an entry is merged only into an already existing range;
	-- two adjacent single code points are not merged with each other.
	local singles, ranges = {}, {}
	for firstHex, lastHex in defaultIgnorable:gmatch("%f[^\n%z](%x+)%.?%.?(%x*)") do
		local first = tonumber(firstHex, 16)
		-- lastHex is empty when the line names a single code point.
		local last = lastHex ~= "" and tonumber(lastHex, 16) or nil
		local lastRange = ranges[#ranges]
		if lastRange and lastRange[2] == first - 1 then
			-- Directly follows the previous range: extend that range.
			lastRange[2] = last or first
		elseif last then
			ranges[#ranges + 1] = { first, last }
		else
			singles[first] = true
		end
	end

	-- The two "..." placeholders are filled in order: singles, then ranges.
	local template = [[
local data = {}

data.defaultIgnorable = {
singles = {
...
},
ranges = {
...
},
}

return data
]]

	local printedRanges = {}
	for i, range in ipairs(ranges) do
		printedRanges[i] = ('\t\t{ 0x%05X, 0x%05X },'):format(range[1], range[2])
	end

	local printedSingles = {}
	local sortedPairs = require 'Module:TableTools'.sortedPairs
	for codepoint in sortedPairs(singles) do
		printedSingles[#printedSingles + 1] = ('\t\t[0x%05X] = true,'):format(codepoint)
	end

	-- Assign to a local first: gsub also returns the substitution count, which
	-- must not leak out as a second return value.
	local data = template
		:gsub('%.%.%.', table.concat(printedSingles, '\n'), 1)
		:gsub('%.%.%.', table.concat(printedRanges, '\n'), 1)
	return data
end
 
local Unicode_data = require "Module:Unicode data/sandbox"
local fun = require "Module:fun"
local m_table = require "Module:TableTools"
 
local function errorf(level, ...)
if type(level) == "number" then
return error(string.format(...), level + 1)
else -- level is actually the format string.
Line 11 ⟶ 68:
end
 
function p.search_for_language_codes(frame)
local Latn_pattern = table.concat {
local page_name = frame.args[1] or "English language"
'[', -- this is a set so include opening bracket
'\n\32-\127', -- C0 Controls and Basic Latin U+0020–U+007E (20 - 7E) + (U+0010 and U+007F <poem>...</poem> support)
local success, title_object = pcall(mw.title.new, page_name)
'\194\160-\194\172', -- C1 Controls and Latin-1 Supplement U+00A0-U+00AC (C2 A0 - C2 AC)
if not (success and title_object) then
'\195\128-\195\191', -- (skip shy) U+00C0–U+00FF (C3 80 - C3 BF)
mw.logf("Could not make title object for '%s'.", page_name)
'\196\128-\197\191', -- Latin Extended-A U+0100–U+017F (C4 80 - C5 BF)
return
'\198\128-\201\143', -- Latin Extended-B U+0180–U+024F (C6 80 - C9 8F)
'\225\184\128-\225\187\191', -- Latin Extended Additional U+1E00-U+1EFF (E1 B8 80 - E1 BB BF)
'\226\177\160-\226\177\191', -- Latin Extended-C U+2C60–U+2C7F (E2 B1 A0 - E2 B1 BF)
'\234\156\160-\234\159\191', -- Latin Extended-D U+A720-U+A7FF (EA 9C A0 - EA 9F BF)
'\234\172\176-\234\173\175', -- Latin Extended-E U+AB30-U+AB6F (EA AC B0 - EA AD AF)
'\239\172\128-\239\172\134', -- Alphabetic Presentaion Forms U+FB00-U+FB06 (EF AC 80 - EF AC 86)
'\239\188\129-\239\188\188', -- Halfwidth and Fullwidth Forms U+FF01-U+FF3C (EF BC 81 - EF BC BC)
'–', -- ndash
'—', -- mdash
'«', '»', -- guillemets commonly used in several 'Latn' languages
']', -- close the set
};
 
local get_codepoint = mw.ustring.codepoint
local function expand_range(start, ending)
local lower, higher = get_codepoint(start), get_codepoint(ending)
if higher < lower then
return nil
end
local chars = {}
local icontent = 0title_object:getContent()
for codepoint = lower, higher do
local language_codes = {}
i = i + 1
for lang_template in content:gmatch "{{lang[^}]+" do
chars[i] = mw.ustring.char(codepoint)
local template_name = lang_template:match("{{([^|}]+)")
local language_code
if template_name == "lang" then
language_code = lang_template:match "{{lang|([^|}]+)"
elseif template_name:find "^lang-" then
language_code = lang_template:match "{{lang-([^|}]+)"
end
if language_code then
language_codes[language_code] = true
end
end
return table.concat(chars)
return table.concat(m_table.keysToList(language_codes), ", ")
end
 
local parsed_subtags_mt = {
local fun = require "Module:Fun"
__index = {
local m_table = require "Module:Table"
-- "error" is the error message.
 
-- "index" is the ordinal of the subtag in which the error was found.
local script_to_count_mt = {
__index throw = function (self, keyerror, index)
self[key].error = 0self.error_messages[error]
self.invalid = table.concat(self.input, "-", index)
return 0
return self:remove_unnecessary_fields()
end,
end,
__call = function (self, ...)
return setmetatable({}, self)
remove_unnecessary_fields = function (self)
end
-- Only useful internally.
self.input = nil
self:pretty_print()
p.validate_lang_tag(self)
return self
end,
-- Regularize capitalization of language subtags:
-- ZH-LATN -> zh-Latn, FR-ca -> fr-CA
pretty_print = function (self)
for key, func in pairs(self.print_funcs) do
if self[key] then
self[key] = func(self[key])
end
end
return self
end,
-- Re-create the original tag from the parsed subtags.
get_tag = function (self)
if self.tag then return self.tag end
local tag = {}
for _, subtag_name in ipairs(self.subtag_order) do
if subtag_name == "private_use" then
table.insert(tag, "x")
end
if type(self[subtag_name]) == "table" then
for _, subtag in ipairs(self[subtag_name]) do
table.insert(tag, subtag)
end
else
table.insert(tag, self[subtag_name])
end
end
tag = table.concat(tag, "-")
self.tag = tag -- Cache the result.
return tag
end,
subtag_order = {
"language", "script", "region", "variant", "private_use"
},
error_messages = {
invalid_characters = "invalid characters",
no_language = "no language subtag",
invalid_subtag = "invalid subtag",
invalid_private_use = "length of private-use subtag out of range",
empty_private_use = "empty private-use subtag",
}
}
}
local function initial_caps_helper(initial, rest)
setmetatable(script_to_count_mt, script_to_count_mt)
return string.upper(initial) .. string.lower(rest)
 
end
-- Uses an iterator (such as mw.ustring.gcodepoint) that generates a codepoint
local function lower_or_map_lower(str)
-- each time it is called with an optional state and another value.
if type(str) == "table" then
local function show_scripts(iterator, state, value)
return fun.map(string.lower, str)
local script_to_count = script_to_count_mt()
else
for codepoint in iterator, state, value do
return string.lower(str)
local script = Unicode_data.lookup_script(codepoint)
script_to_count[script] = script_to_count[script] + 1
end
return table.concat(
fun.mapIter(
function (count, script)
return ("%s (%d)"):format(script, count)
end,
m_table.sortedPairs(
script_to_count,
function (script1, script2)
return script_to_count[script1] > script_to_count[script2]
end)),
", ")
end
-- Capitalization function for each kind of subtag, keyed by subtag name and
-- stored on the method table of parsed_subtags_mt.
do
	-- Applies initial_caps_helper to a script code made of exactly four
	-- letters; any other string is returned unchanged.
	local function format_script(script_code)
		local formatted = string.gsub(script_code, "^(%a)(%a%a%a)$", initial_caps_helper)
		return formatted
	end

	local print_funcs = {}
	print_funcs.language = string.lower
	print_funcs.script = format_script
	print_funcs.region = string.upper
	print_funcs.variant = lower_or_map_lower
	print_funcs.private_use = lower_or_map_lower

	parsed_subtags_mt.__index.print_funcs = print_funcs
end
 
setmetatable(parsed_subtags_mt, {
local function get_chars_in_scripts(iterator, state, value)
__call = function (self, input)
local script_to_char_set = {}
return setmetatable({ input = input }, self)
for codepoint in iterator, state, value do
local script = Unicode_data.lookup_script(codepoint)
script_to_char_set[script] = script_to_char_set[script] or {}
script_to_char_set[script][codepoint] = true
end
})
-- An array of patterns for each subtag, and a "type" field for the name
return script_to_char_set
-- of the subtag.
end
-- The patterns are checked in order, and any of the subtags can be skipped.
-- So, for example, the "language" subtag must precede the "script"
-- subtag, but a tag may contain a "language" subtag, no "script" subtag
-- and then a "region" subtag.
-- If the full list of subtags has been iterated over, the remaining subtags
-- must match the pattern for a private-use subtag, or the tag is invalid.
-- Ordered descriptors of the subtag kinds. The array part of each entry holds
-- the Lua patterns accepted for that kind, "type" names the kind, and
-- "repeatable" marks a kind that may occur more than once.
-- (This table could be moved into a data module.)
local subtag_info = {
	{
		type = "language",
		"%a%a%a?", -- ll or lll
		"1%a+", -- special case
		-- include extlang?
	},
	{
		type = "script",
		"%a%a%a%a", -- Ssss
	},
	{
		type = "region",
		"%a%a", -- rr
		"%d%d%d", -- DDD
	},
	{
		type = "variant",
		repeatable = true, -- There can be multiple variants.
		"%d%d%d%d", -- 4 digits
		"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters
	},
}
 
-- A previous draft, in [[Module:Lang/sandbox]]:
-- https://en.wikipedia.org/w/index.php?oldid=812819217

--- Render a map from script to character set as text, one entry per script.
-- Scripts, and the code points within each script, are visited through
-- m_table.sortedPairs. The characters of a script are concatenated and passed
-- through mw.text.nowiki before being formatted.
-- @param script_to_char_set  table mapping a script to a set of code points
--                            (code points are the keys).
-- @param format     format string taking the script and its characters;
--                   defaults to "%s: %s".
-- @param separator  string placed between entries; defaults to "\n".
-- @return string  the formatted entries joined by the separator.
local function print_char_set_map(script_to_char_set, format, separator)
	local entry_format = format or "%s: %s"
	local joiner = separator or "\n"

	-- The code point is the key of the set; the stored value is ignored.
	local function codepoint_to_char(_, codepoint)
		return mw.ustring.char(codepoint)
	end

	local function describe_script(char_set, script)
		local chars = fun.mapIter(codepoint_to_char, m_table.sortedPairs(char_set))
		local escaped = mw.text.nowiki(table.concat(chars))
		return entry_format:format(script, escaped)
	end

	local entries = fun.mapIter(describe_script, m_table.sortedPairs(script_to_char_set))
	return table.concat(entries, joiner)
end
 
-- Based on https://www.w3.org/International/articles/language-tags/.
function p.show(frame)
 
local expanded_pattern = Latn_pattern
-- Parse a language tag.
:gsub('%[(.-)%]', '%1')
-- Returns nil if tag is not a string or empty.
:gsub( -- Find two UTF-8-encoded characters separated by hyphen-minus.
-- Else returns a table with a map of subtag type to subtag for all subtags that
'([%z\1-\127\194-\244][\128-\191]*)%-([%z\1-\127\194-\244][\128-\191]*)',
-- were parsed.
function (char1, char2)
-- If there was an error, returns an "error" field with a description of the
return expand_range(char1, char2)
-- error, and an "invalid" field with the suffix of the tag starting at the
end)
-- index where the error occurred.
 
-- Does not recognize "extension" tags, such as those introduced by "u", as they
-- are not needed on Wikipedia. Does not recognize "grandfathered" tags.
-- Does not recognize extended language subtags, such as "zh-yue".
-- https://www.rfc-editor.org/rfc/rfc6067.txt, https://tools.ietf.org/html/bcp47
 
-- Only checks that the syntax is correct, not that the values are valid. For
-- instance, will accept non-existent language codes, like "zz".
function p.parse_IETF(tag)
if type(tag) ~= "string" or tag == "" then
return nil
end
-- This may contain the special fields "invalid", "error".
--[[
-- "error" indicates why the
mw.log(
-- tag is invalid (if applicable).
print_char_set_map(
-- All other fields are subtags, and they appear in the tag in the following
get_chars_in_scripts(
-- order:
expanded_pattern:gmatch "[%z\1-\127\194-\244][\128-\191]*")))
-- "language", "script", "region", "variant", "private_use", "invalid"
--]]
-- All these subtags can be strings or nil, while "variant" can also be an
-- array of strings if more than one variant subtag was found.
-- "invalid" is the portion of the tag after the last valid subtag (minus a
-- hyphen).
local segments = mw.text.split(tag, "-")
local parsed_subtags = parsed_subtags_mt(segments)
-- Language tags probably only contain ASCII alphabetic and numerical
return ('* <div style="overflow-wrap: break-word;">%s</div><br>%s')
-- characters and hyphen-minus.
:format(expanded_pattern
if not tag:find "^[A-Za-z0-9-]+$" then
:gsub('^%s*', ''), -- Remove initial '\n ' to avoid creating unwanted pre element.
return parsed_subtags:throw(
show_scripts(mw.ustring.gcodepoint(expanded_pattern)))
"invalid_characters",
end
fun.indexOf(
 
local function get_block_info_from_arg(args, argtag)
return tag:find "[^A-Za-z0-9-]"
local block_name = args[1]
end,
or errorf("Parameter %s is required", tostring(arg))
segments))
end
local subtag_i = 1 -- Index of current item in subtag_info.
local block_info = Unicode_data.get_block_info(block_name)
local segment_i = 1 -- Index of current segment.
or errorf("The block '%s' could be found", block_name)
while segments[segment_i] and subtag_info[subtag_i] do
local segment = segments[segment_i]
local subtag_type
while not subtag_type and subtag_info[subtag_i] do
-- Check each pattern for the subtag type at "subtag_i" in "subtag_info".
local cur_subtag = subtag_info[subtag_i]
for _, pattern in ipairs(cur_subtag) do
if segment:find("^" .. pattern .. "$") then
subtag_type = cur_subtag.type
-- There can be multiple "variant" subtags (and "extension"
-- subtags, if those are added).
if not cur_subtag.repeatable then
subtag_i = subtag_i + 1
end
break
end
end
if not subtag_type then -- No match; try next subtag.
subtag_i = subtag_i + 1
end
end
-- If language subtag has not been found, or the current segment has not
-- been matched as a subtag, break the loop and check for
-- a private-use subtag.
if segment_i == 1 and subtag_type ~= "language" or not subtag_type then
break
else
if parsed_subtags[subtag_type] then -- Create an array.
if type(parsed_subtags[subtag_type]) == "string" then
parsed_subtags[subtag_type] = { parsed_subtags[subtag_type] }
end -- else table
table.insert(parsed_subtags[subtag_type], segment)
else
parsed_subtags[subtag_type] = segment
end
last_matched_segment_i = segment_i
end
segment_i = segment_i + 1
end
if segments[segment_i] then -- More segments to scan?
return block_info
-- Not all potential subtags were matched. Check for private-use subtags.
-- https://tools.ietf.org/html/bcp47#section-2.2.7
-- Private-use subtags consist of one or more sequences of 1 to 8
-- alphanumeric characters preceded by "x-".
-- Alphanumericity has already been checked.
-- A tag must start with either a language subtag or a private-use subtag.
-- If next segment is not "x", introducing a private-use subtag, there
-- is no private-use subtag.
if segments[segment_i] and segments[segment_i]:lower() ~= "x" then
if not parsed_subtags.language then
return parsed_subtags:throw("no_language", 1)
else
return parsed_subtags:throw("invalid_subtag",
segment_i)
end
elseif not segments[segment_i + 1] then
return parsed_subtags:throw("empty_private_use",
segment_i)
end
-- Check length of all segments after "x".
for i = segment_i + 1, #segments do
local length = #segments[i]
if not (1 <= length and length <= 8) then
return parsed_subtags
:throw("invalid_private_use", segment_i)
end
end
if not segments[last_matched_segment_i + 3] then -- There is only one private-use subtag.
parsed_subtags.private_use = segments[segment_i + 1]
else
parsed_subtags.private_use = {}
for i = segment_i + 1, #segments do
table.insert(parsed_subtags.private_use, segments[i])
end
end
end
return parsed_subtags:remove_unnecessary_fields()
end
 
--- Interpret an argument as a boolean by means of Module:Yesno.
-- @param args  table of arguments.
-- @param arg   key of the argument to read.
-- @return the argument itself when it is nil or false (Module:Yesno is not
--         loaded in that case); otherwise the first result of calling
--         Module:Yesno with the argument.
local function get_boolean_from_arg(args, arg)
	local raw = args[arg]
	if not raw then
		return raw
	end
	local yesno = require "Module:Yesno"
	-- Parentheses keep exactly one return value, as the original expression did.
	return (yesno(raw))
end
 
local lang_name_table = mw.loadData "Module:Language/name/data"
function p.scripts_in_block(frame)
local synonym_table = mw.loadData "Module:Lang/ISO 639 synonyms"
local block_info = get_block_info_from_arg(frame.args, 1)
local lang_data = mw.loadData "Module:Lang/data"
local show_block_name = get_boolean_from_arg(frame.args, 2)
local script_list = show_scripts(fun.range(block_info[1], block_info[2]))
if show_block_name then
return ("%s: %s"):format(block_info[3], script_list)
else
return script_list
end
end
 
function p.chars_in_scripts_in_blockvalidate_lang_tag(frameparsed_subtags)
-- Already checked that the tag starts with a language subtag or a private-use subtag.
local block_info = get_block_info_from_arg(frame.args, 1)
-- Script code is initially capitalized, region code is uppercase,
local show_block_name = get_boolean_from_arg(frame.args, 2)
-- everything else is lowercase.
local script_char_set_map = print_char_set_map(
get_chars_in_scripts(fun.range(block_info[1], block_info[2])))
-- Check existence of language tag.
if show_block_name then
if parsed_subtags.language and
return ("%s: %s"):format(block_info[3], script_char_set_map)
not (lang_data.override[parsed_subtags.language]
else
or lang_name_table.lang[parsed_subtags.language]) then
return script_char_set_map
mw.log("Invalid language code", parsed_subtags.language, "in", parsed_subtags:get_tag())
end
end
 
function p.search_for_language_codes(frame)
local page_name = frame.args[1] or "English language"
-- Check existence of script tag.
local success, title_object = pcall(mw.title.new, page_name)
if not (success and title_object)parsed_subtags.script then
local lower_script = parsed_subtags.script:lower()
mw.log(("Could not make title object for '%s'"):format(page_name))
if not lang_name_table.script[lower_script] then
return
mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag())
end
-- Check that script tag is not marked as superfluous (because
-- it is considered the default one for the language).
if lang_name_table.suppressed[lower_script]
and parsed_subtags.language
and m_table.inArray(
lang_name_table.suppressed[lower_script],
parsed_subtags.language:lower()) then
mw.log(parsed_subtags.script, "is suppressed with",
parsed_subtags.language, "in", parsed_subtags:get_tag())
end
end
-- Check existence of region code.
local content = title_object:getContent()
if parsed_subtags.region and not lang_name_table.region[parsed_subtags.region:lower()] then
mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag())
end
-- Check that variant code is valid, and that it can validly be used with the
local language_codes = {}
-- given combination of language, script, region, and variant.
for lang_template in content:gmatch '{{lang[^}]+' do
-- Check for duplicate variant subtags?
local template_name = lang_template:match('{{([^|}]+)')
if parsed_subtags.variant then
local language_code
local lower_tag = parsed_subtags:get_tag():lower()
if template_name == 'lang' then
language_code = lang_template:match '{{lang|([^|}]+)'
for _, variant in ipairs(type(parsed_subtags.variant) == "table"
elseif template_name:find '^lang-' then
and parsed_subtags.variant or { parsed_subtags.variant }) do
language_code = lang_template:match '{{lang-([^|}]+)'
if not lang_name_table.variant[variant] then
end
mw.log("Invalid variant code", variant, "in", parsed_subtags:get_tag())
if language_code then
else
language_codes[language_code] = true
local prefix = parsed_subtags:get_tag():lower():match("^(.-)%-" .. variant)
-- Check that at least one of the prefixes is found at the
-- beginning of lower_tag.
if not fun.some(function (prefix)
return lower_tag:find(prefix, 1, true) == 1
end,
lang_name_table.variant[variant].prefixes) then
mw.log("Variant tag", variant, "does not belong with prefix",
prefix, "in", parsed_subtags:get_tag())
end
end
end
end
-- Check that the private-use subtag is actually used by Wikipedia.
return table.concat(m_table.keysToList(language_codes), ', ')
if parsed_subtags.private_use and not lang_data.override[parsed_subtags.tag] then
mw.log("Invalid private-use subtag in", parsed_subtags:get_tag())
end
end
 
--- Show the key-value pairs stored in the COinS metadata of a citation.
--
-- Finds the first <span> whose class attribute is "Z3988" in the given text,
-- splits the value of its title attribute at "&" into "key=value" items, and
-- decodes each value with mw.uri.decode.
--
-- @param frame  Frame object; frame.args[1] is the text containing the span.
-- @return string  the original text, a blank line, and then the decoded
--                 pairs as "key: value", visited through m_table.sortedPairs
--                 and joined by ", ".
-- Raises an error when the argument is missing, when no such span is found,
-- or when the span has no title attribute.
function p.show_COinS(frame)
	local ref = frame.args[1]
	if not ref then
		error("Parameter 1 is required")
	end

	local tag = ref:match('<span [^>]*class="Z3988"[^>]*>')
	if not tag then
		error('No <span> with class="Z3988" was found in parameter 1')
	end

	local data = tag:match('title="(.-)"')
	if not data then
		error('The <span> with class="Z3988" has no title attribute')
	end

	local vals = {}
	for item in mw.text.gsplit(data, "&") do
		local key, value = item:match("(.-)=(.*)")
		-- Items without "=" carry no key-value pair; skip them instead of
		-- failing on a nil table index.
		if key then
			vals[key] = mw.uri.decode(value)
		end
	end

	return ref .. "\n\n" .. table.concat(
		require "Module:fun".mapIter(
			function (value, key)
				return ("%s: %s"):format(key, value)
			end,
			m_table.sortedPairs(vals)),
		", ")
end