local p = {}
local Unicode_data = require 'Module:Unicode data/sandbox'
-- Raises an error whose message is built with string.format.
--
-- Two calling conventions are supported:
--   errorf(level, fmt, ...)  -- level is a number, interpreted relative to the
--                            -- caller of errorf (1 = the function calling errorf)
--   errorf(fmt, ...)         -- no level given; the error is attributed to the
--                            -- function calling errorf
--
-- Never returns normally.
local function errorf(level, ...)
	-- type() returns a string, so the comparison has to be against "number";
	-- comparing against a bare identifier would compare against nil.
	if type(level) == "number" then
		-- +1 skips errorf's own stack frame.
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end
-- Adds a printf-style logging helper to the mw table: the arguments are run
-- through string.format and the resulting message is handed to mw.log.
mw.logf = function (...)
	local message = string.format(...)
	return mw.log(message)
end
-- Metatable for a small string buffer. Pieces are stored at integer keys
-- 1..n and the number of stored pieces is tracked in the field n.
local output_mt = {
	-- Joins the stored pieces; takes the same optional arguments as
	-- table.concat because it is table.concat.
	join = table.concat,
}
output_mt.__index = output_mt

-- Appends str as the next piece of the buffer.
function output_mt.insert(self, str)
	local position = self.n + 1
	self.n = position
	self[position] = str
end

-- also in [[Module:Unicode data/documentation functions]]
-- Formats the arguments with string.format and appends the result.
function output_mt.insert_format(self, ...)
	self:insert(string.format(...))
end

-- Creates a new, empty buffer.
local function Output()
	local buffer = { n = 0 }
	return setmetatable(buffer, output_mt)
end
-- A bracketed character set, written as UTF-8 byte escapes, listing the
-- characters and character ranges treated as Latin script. In this file
-- p.show strips the brackets and expands each "X-Y" range into the individual
-- characters. NOTE(review): presumably also meant to be usable as a ustring
-- pattern set; confirm against other users of this pattern.
local Latn_pattern = table.concat {
'[', -- this is a set so include opening bracket
'\n\32-\127', -- newline U+000A plus Basic Latin U+0020–U+007F (20 - 7F); the newline and U+007F are included for <poem>...</poem> support
'\194\160-\194\172', -- Latin-1 Supplement U+00A0-U+00AC (C2 A0 - C2 AC)
'\195\128-\195\191', -- Latin-1 Supplement U+00C0–U+00FF (C3 80 - C3 BF); U+00AD (soft hyphen) through U+00BF are skipped
'\196\128-\197\191', -- Latin Extended-A U+0100–U+017F (C4 80 - C5 BF)
'\198\128-\201\143', -- Latin Extended-B U+0180–U+024F (C6 80 - C9 8F)
'\225\184\128-\225\187\191', -- Latin Extended Additional U+1E00-U+1EFF (E1 B8 80 - E1 BB BF)
'\226\177\160-\226\177\191', -- Latin Extended-C U+2C60–U+2C7F (E2 B1 A0 - E2 B1 BF)
'\234\156\160-\234\159\191', -- Latin Extended-D U+A720-U+A7FF (EA 9C A0 - EA 9F BF)
'\234\172\176-\234\173\175', -- Latin Extended-E U+AB30-U+AB6F (EA AC B0 - EA AD AF)
'\239\172\128-\239\172\134', -- Alphabetic Presentation Forms U+FB00-U+FB06 (EF AC 80 - EF AC 86)
'\239\188\129-\239\188\188', -- Halfwidth and Fullwidth Forms U+FF01-U+FF3C (EF BC 81 - EF BC BC)
'–', -- en dash
'—', -- em dash
'«', '»', -- guillemets commonly used in several 'Latn' languages
']', -- close the set
};
local get_codepoint = mw.ustring.codepoint

-- Builds a string containing every character whose codepoint lies between
-- the codepoint of `start` and the codepoint of `ending`, inclusive, in
-- ascending order. Returns nil when the ending codepoint is lower than the
-- starting one.
local function expand_range(start, ending)
	local first = get_codepoint(start)
	local last = get_codepoint(ending)
	if last < first then
		return nil
	end

	local buffer = {}
	for codepoint = first, last do
		-- Slot 1 holds the first character, slot 2 the next, and so on.
		buffer[codepoint - first + 1] = mw.ustring.char(codepoint)
	end
	return table.concat(buffer)
end
local fun = require "Module:Fun"
local m_table = require "Module:Table"
-- Metatable for counter tables: reading a key that has not been set yet
-- stores 0 under that key and yields 0, so callers can write
-- t[k] = t[k] + 1 without initialising. The metatable is its own metatable,
-- which makes it callable: script_to_count_mt() creates a fresh counter table.
local script_to_count_mt = {}

function script_to_count_mt.__index(counter, key)
	counter[key] = 0
	return 0
end

function script_to_count_mt.__call(mt, ...)
	local counter = {}
	return setmetatable(counter, mt)
end

setmetatable(script_to_count_mt, script_to_count_mt)
-- Tallies the script of every codepoint produced by a generic-for iterator
-- triple (such as the one returned by mw.ustring.gcodepoint) and returns a
-- string of the form "Script (count), Script (count), ...", ordered by
-- descending count.
local function show_scripts(iterator, state, value)
	local tally = script_to_count_mt()
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		tally[script] = tally[script] + 1
	end

	-- Scripts with more codepoints come first.
	local function by_descending_count(script1, script2)
		return tally[script1] > tally[script2]
	end

	local function describe(count, script)
		return ("%s (%d)"):format(script, count)
	end

	local entries = fun.mapIter(
		describe,
		m_table.sortedPairs(tally, by_descending_count))
	return table.concat(entries, ", ")
end
-- Groups the codepoints produced by a generic-for iterator triple by script.
-- Returns a table mapping each script (as returned by
-- Unicode_data.lookup_script) to a set table whose keys are the codepoints
-- seen for that script and whose values are true.
local function get_chars_in_scripts(iterator, state, value)
	local sets_by_script = {}
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		local char_set = sets_by_script[script]
		if not char_set then
			char_set = {}
			sets_by_script[script] = char_set
		end
		char_set[codepoint] = true
	end
	return sets_by_script
end
-- Renders a script -> codepoint-set map (as built by get_chars_in_scripts)
-- as text.
--   format:    string.format template receiving the script and the
--              nowiki-escaped characters; defaults to "%s: %s".
--   separator: string placed between the per-script entries; defaults to a
--              newline.
-- Scripts and codepoints are visited in the order given by
-- m_table.sortedPairs.
local function print_char_set_map(script_to_char_set, format, separator)
	local entry_format = format or "%s: %s"
	local joiner = separator or "\n"

	local function codepoint_to_char(_, codepoint)
		return mw.ustring.char(codepoint)
	end

	local function render_script(char_set, script)
		local chars = fun.mapIter(codepoint_to_char, m_table.sortedPairs(char_set))
		local escaped = mw.text.nowiki(table.concat(chars))
		return entry_format:format(script, escaped)
	end

	local entries = fun.mapIter(render_script, m_table.sortedPairs(script_to_char_set))
	return table.concat(entries, joiner)
end
-- Displays every character covered by Latn_pattern, followed by a summary of
-- the scripts those characters belong to.
function p.show(frame)
	-- Drop the enclosing square brackets of the set.
	local set_body = Latn_pattern:gsub('%[(.-)%]', '%1')

	-- Find two UTF-8-encoded characters separated by hyphen-minus and replace
	-- each such range with all of its characters. expand_range returns nil for
	-- a reversed range, in which case gsub leaves the match unchanged.
	local expanded_pattern = set_body:gsub(
		'([%z\1-\127\194-\244][\128-\191]*)%-([%z\1-\127\194-\244][\128-\191]*)',
		expand_range)

	-- Remove initial '\n ' to avoid creating unwanted pre element.
	local displayed = expanded_pattern:gsub('^%s*', '')
	local script_summary = show_scripts(mw.ustring.gcodepoint(expanded_pattern))

	return ('* <div style="overflow-wrap: break-word;">%s</div><br>%s')
		:format(displayed, script_summary)
end
-- Looks up the Unicode block named by args[arg].
--   args: argument table (e.g. frame.args)
--   arg:  key of the argument that holds the block name
-- Returns whatever Unicode_data.get_block_info returns for that name.
-- Raises an error if the argument is missing or if no block info is returned
-- for the name.
local function get_block_info_from_arg(args, arg)
	-- Read the argument the caller asked for rather than always the first one.
	local block_name = args[arg]
		or errorf("Parameter %s is required", tostring(arg))
	local block_info = Unicode_data.get_block_info(block_name)
		or errorf("The block '%s' could not be found", block_name)
	return block_info
end
-- Interprets args[arg] as a boolean by passing it through Module:Yesno.
-- When the argument is absent (nil or false) that falsy value is returned
-- as-is and Module:Yesno is not loaded.
local function get_boolean_from_arg(args, arg)
	local raw = args[arg]
	if not raw then
		return raw
	end
	local yesno = require "Module:Yesno"
	-- Parentheses keep the result to a single value.
	return (yesno(raw))
end
-- Lists the scripts found in one Unicode block together with their counts.
--   frame.args[1]: block name (required)
--   frame.args[2]: yes/no flag; when true the output is prefixed with the
--                  block name taken from the block info
function p.scripts_in_block(frame)
	local args = frame.args
	local block_info = get_block_info_from_arg(args, 1)
	local show_block_name = get_boolean_from_arg(args, 2)

	-- block_info[1] and block_info[2] are passed to fun.range as the bounds
	-- of the codepoints to examine.
	local script_list = show_scripts(fun.range(block_info[1], block_info[2]))

	if not show_block_name then
		return script_list
	end
	return ("%s: %s"):format(block_info[3], script_list)
end
-- Turns a block name into a wikilink. Names containing a space are linked
-- directly; names without a space are linked to "<name> (Unicode block)"
-- with the bare name as the link text.
local function link_block_name(block_name)
	local has_space = block_name:find(' ', 1, true)
	local template = has_space
		and "[[%s]]"
		or "[[%s (Unicode block)|%s]]"
	-- string.format ignores the surplus argument for the one-placeholder form.
	return template:format(block_name, block_name)
end
-- Builds a wikitable with one row per Unicode block, showing the block's
-- codepoint range and how many of its codepoints belong to each script.
--   frame.args[1]: hexadecimal codepoint; blocks starting below it are
--                  skipped (default 0)
--   frame.args[2]: hexadecimal codepoint; processing stops at the first block
--                  starting above it (default 0x4000)
-- Codepoints with no entry in the script data are counted as "Zzzz".
function p.scripts_in_blocks(frame)
	local output = Output()
	local start = frame.args[1] and tonumber(frame.args[1], 16) or 0
	local ending = frame.args[2] and tonumber(frame.args[2], 16) or 0x4000

	local script_data = mw.loadData "Module:Unicode data/scripts"
	local singles = script_data.singles
	local ranges = script_data.ranges

	-- Per-block tally of script code -> number of codepoints. The helper
	-- methods live in the metatable so that they are not iterated as data.
	local counts = setmetatable({}, {
		__index = {
			increment = function (self, script_code, amount)
				self[script_code] = (self[script_code] or 0) + (amount or 1)
			end,
			clear = function (self)
				for _, key in ipairs(m_table.keysToList(self, false)) do
					self[key] = nil
				end
			end,
		}
	})

	-- Scripts with more codepoints come first.
	local function by_descending_count(script1, script2)
		return counts[script1] > counts[script2]
	end

	local function describe(count, script)
		return ("%s (%d)"):format(script, count)
	end

	output:insert [[
{| class="wikitable"
|+ Scripts in each Unicode block
! block !! codepoints !! scripts
]]

	-- ipairs guarantees ascending index order, which the early `break` below
	-- depends on; pairs gives no ordering guarantee.
	for _, block in ipairs(mw.loadData "Module:Unicode data/blocks") do
		local codepoint = block[1]
		local block_end = block[2]
		if codepoint > ending then break end

		if codepoint >= start then
			while codepoint <= block_end do
				local single_script = singles[codepoint]
				if single_script then
					counts:increment(single_script)
					codepoint = codepoint + 1
				else
					local range, index = Unicode_data.binary_range_search(codepoint, ranges)
					if range then
						-- Count the rest of the range (or of the block, whichever
						-- ends first) in one step.
						local last = math.min(range[2], block_end)
						counts:increment(range[3], last - codepoint + 1)
						codepoint = last + 1
					else -- Codepoint doesn't have data; it's Zzzz.
						-- Get range immediately above codepoint. There may be
						-- none when the codepoint lies past the last range.
						while ranges[index] and ranges[index][2] < codepoint do
							index = index + 1
						end
						local next_range = ranges[index]
						local gap_end = next_range and next_range[1] or math.huge

						local count = 0
						while codepoint < gap_end and codepoint <= block_end
								and not singles[codepoint] do
							count = count + 1
							codepoint = codepoint + 1
						end
						counts:increment("Zzzz", count)
					end
				end
			end

			-- mapIter (as in show_scripts) because sortedPairs yields an
			-- iterator, not a table.
			local script_list = table.concat(
				fun.mapIter(
					describe,
					m_table.sortedPairs(counts, by_descending_count)),
				", ")

			output:insert_format([[
|-
| %s
| U+%04X–U+%04X
| %s
]], link_block_name(block[3]), block[1], block[2], script_list)
		end

		counts:clear()
	end

	output:insert "|}"
	return output:join()
end
-- Lists the characters of one Unicode block grouped by script.
--   frame.args[1]: block name (required)
--   frame.args[2]: yes/no flag; when true the output is prefixed with the
--                  block name taken from the block info
function p.chars_in_scripts_in_block(frame)
	local args = frame.args
	local block_info = get_block_info_from_arg(args, 1)
	local show_block_name = get_boolean_from_arg(args, 2)

	local chars_by_script = get_chars_in_scripts(
		fun.range(block_info[1], block_info[2]))
	local script_char_set_map = print_char_set_map(chars_by_script)

	if not show_block_name then
		return script_char_set_map
	end
	return ("%s: %s"):format(block_info[3], script_char_set_map)
end
-- Scans the wikitext of a page for {{lang|xx|...}} and {{lang-xx|...}}
-- templates and returns the distinct language codes found, joined with ", ".
--   frame.args[1]: title of the page to scan (default "English language")
-- Logs a message and returns nothing when the title object cannot be created
-- or the page content cannot be read.
function p.search_for_language_codes(frame)
	local page_name = frame.args[1] or "English language"

	local success, title_object = pcall(mw.title.new, page_name)
	if not (success and title_object) then
		mw.logf("Could not make title object for '%s'.", page_name)
		return
	end

	-- getContent yields nil when there is no content to read (for example a
	-- nonexistent page); bail out instead of indexing nil below.
	local content = title_object:getContent()
	if not content then
		mw.logf("Could not get content of page '%s'.", page_name)
		return
	end

	-- Used as a set: keys are the language codes encountered.
	local language_codes = {}
	for lang_template in content:gmatch '{{lang[^}]+' do
		local template_name = lang_template:match('{{([^|}]+)')
		local language_code
		if template_name == 'lang' then
			-- {{lang|xx|...}}: the code is the first parameter.
			language_code = lang_template:match '{{lang|([^|}]+)'
		elseif template_name:find '^lang-' then
			-- {{lang-xx|...}}: the code is part of the template name.
			language_code = lang_template:match '{{lang-([^|}]+)'
		end
		if language_code then
			language_codes[language_code] = true
		end
	end

	return table.concat(m_table.keysToList(language_codes), ', ')
end
return p