Module:Sandbox/Erutuon

This is an old revision of this page, as edited by Erutuon (talk | contribs) at 19:33, 1 July 2018 (gah). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.

local p = {}

local Unicode_data = require 'Module:Unicode data'

--- Raises an error whose message is built with string.format.
-- Accepts two call shapes:
--   errorf(level, fmt, ...)  -- explicit stack level, as for error()
--   errorf(fmt, ...)         -- level defaults to the caller of errorf
-- @param level number stack level relative to the caller of errorf, or the
--   format string itself when no level is given.
-- @param ... the format string (if level was given) and its arguments.
-- Never returns normally.
local function errorf(level, ...)
	-- type() returns a string, so the comparison must be against "number";
	-- comparing against the bare name `number` reads an undefined global (nil)
	-- and is never true.
	if type(level) == "number" then
		-- Level 0 asks error() to add no position information, so it is passed
		-- through; any other level is shifted by one to skip errorf's own frame.
		if level ~= 0 then
			level = level + 1
		end
		return error(string.format(...), level)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end

-- Character set (brackets included) listing characters treated as Latin.
-- Each range is written as the UTF-8 bytes of its endpoints in decimal
-- escapes; the trailing comment gives the code points and the bytes in hex.
-- NOTE(review): the ranges span multi-byte characters, so this is presumably
-- meant for the mw.ustring functions rather than the byte-based string
-- library -- confirm against the callers.
local Latn_pattern = '[' .. table.concat {
	'\n\32-\127',						-- line feed U+000A, then Basic Latin and DEL: U+0020-U+007F (20 - 7F); LF and DEL are there for <poem>...</poem> support
	'\194\160-\194\172',				-- Latin-1 Supplement, first part: U+00A0-U+00AC (C2 A0 - C2 AC); stops before the soft hyphen U+00AD
	'\195\128-\195\191',				-- Latin-1 Supplement, letters: U+00C0-U+00FF (C3 80 - C3 BF)
	'\196\128-\197\191',				-- Latin Extended-A: U+0100-U+017F (C4 80 - C5 BF)
	'\198\128-\201\143',				-- Latin Extended-B: U+0180-U+024F (C6 80 - C9 8F)
	'\225\184\128-\225\187\191',		-- Latin Extended Additional: U+1E00-U+1EFF (E1 B8 80 - E1 BB BF)
	'\226\177\160-\226\177\191',		-- Latin Extended-C: U+2C60-U+2C7F (E2 B1 A0 - E2 B1 BF)
	'\234\156\160-\234\159\191',		-- Latin Extended-D: U+A720-U+A7FF (EA 9C A0 - EA 9F BF)
	'\234\172\176-\234\173\175',		-- Latin Extended-E: U+AB30-U+AB6F (EA AC B0 - EA AD AF)
	'\239\172\128-\239\172\134',		-- Alphabetic Presentation Forms: U+FB00-U+FB06 (EF AC 80 - EF AC 86)
	'\239\188\129-\239\188\188',		-- Halfwidth and Fullwidth Forms: U+FF01-U+FF3C (EF BC 81 - EF BC BC)
	'–',								-- en dash
	'—',								-- em dash
	'«', '»',							-- guillemets commonly used in several 'Latn' languages
} .. ']'

local get_codepoint = mw.ustring.codepoint

--- Expands a character range into the string of every character it covers.
-- @param start string whose first character is the lower end of the range
-- @param ending string whose first character is the upper end of the range
-- @return the concatenation of all characters from start to ending
--   inclusive, or nil when ending has a lower code point than start.
local function expand_range(start, ending)
	local first = get_codepoint(start)
	local last = get_codepoint(ending)
	if last < first then
		return nil
	end

	local to_char = mw.ustring.char
	local pieces = {}
	-- Walk the range by offset so the offset doubles as the array index.
	for offset = 0, last - first do
		pieces[offset + 1] = to_char(first + offset)
	end
	return table.concat(pieces)
end

local fun = require "Module:Fun"
local m_table = require "Module:Table"

-- Metatable for counter tables: reading a key that is not present stores 0
-- under that key and yields 0, so counts can be incremented without an
-- explicit initialisation step.
local script_to_count_mt = {}

-- Called on a missing key: record a zero count for it and return that zero.
function script_to_count_mt.__index(counts, script)
	counts[script] = 0
	return 0
end

-- Calling the metatable builds a fresh, empty counter table that uses it.
function script_to_count_mt.__call(mt)
	return setmetatable({}, mt)
end

-- The table serves as its own metatable; that is what makes the
-- script_to_count_mt() call syntax work.
setmetatable(script_to_count_mt, script_to_count_mt)

-- Uses an iterator (such as mw.ustring.gcodepoint) that generates a codepoint
-- each time it is called with an optional state and another value.
-- Looks up the script of every codepoint with Unicode_data.lookup_script and
-- returns a string of "script (count)" items joined by ", ", ordered from the
-- most frequent script to the least frequent.
local function show_scripts(iterator, state, value)
	local script_to_count = script_to_count_mt()
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		script_to_count[script] = script_to_count[script] + 1
	end

	-- Higher counts come first. Scripts with equal counts are ordered by name
	-- so that the result does not depend on the order in which the keys
	-- happen to be enumerated or on how the sort treats ties.
	-- NOTE(review): assumes lookup_script returns strings, so that `<` is
	-- defined on the keys -- confirm.
	local function by_count_then_name(script1, script2)
		local count1, count2 = script_to_count[script1], script_to_count[script2]
		if count1 ~= count2 then
			return count1 > count2
		end
		return script1 < script2
	end

	return table.concat(
		fun.mapIter(
			-- The callback receives the value (count) before the key (script).
			function (count, script)
				return ("%s (%d)"):format(script, count)
			end,
			m_table.sortedPairs(script_to_count, by_count_then_name)),
		", ")
end

-- Expands every range in Latn_pattern into its individual characters and
-- returns wikitext consisting of the expanded character list in a div,
-- followed by a per-script count of those characters.
-- The frame argument is not used.
function p.show(frame)
	-- One UTF-8-encoded character: a lead byte plus any continuation bytes.
	local utf8_char = '[%z\1-\127\194-\244][\128-\191]*'

	-- Strip the brackets that delimit the set.
	local set_body = Latn_pattern:gsub('%[(.-)%]', '%1')

	-- Find two UTF-8-encoded characters separated by hyphen-minus and replace
	-- them with the whole range. When expand_range returns nil, gsub leaves
	-- the matched text unchanged.
	local expanded_pattern = set_body:gsub(
		'(' .. utf8_char .. ')%-(' .. utf8_char .. ')',
		expand_range)

	-- Group the characters by script; only the disabled logging below reads
	-- this table.
	local script_to_char_set = {}
	for char in expanded_pattern:gmatch(utf8_char) do
		local script = Unicode_data.lookup_script(get_codepoint(char))
		local char_set = script_to_char_set[script]
		if not char_set then
			char_set = {}
			script_to_char_set[script] = char_set
		end
		char_set[char] = true
	end

	--[[
	mw.log(
		table.concat(
			fun.mapIter(
				function (char_set, script)
					local char_list = m_table.keysToList(char_set)
					return ("%s: %s"):format(script, table.concat(char_list))
				end,
				m_table.sortedPairs(script_to_char_set)),
			'\n'))
	--]]

	-- Remove initial '\n ' to avoid creating unwanted pre element.
	local trimmed = expanded_pattern:gsub('^%s*', '')
	local summary = show_scripts(mw.ustring.gcodepoint(expanded_pattern))

	return ('* <div style="overflow-wrap: break-word;">%s</div><br>%s')
		:format(trimmed, summary)
end

-- Lists the scripts found in a Unicode block together with their counts.
-- frame.args[1]: name of the block (required).
-- frame.args[2]: optional; when Module:Yesno evaluates it as true, the result
--   is prefixed with "<block name>: ".
-- Raises an error when parameter 1 is missing or when
-- Unicode_data.get_block_range finds no range for the given name.
function p.scripts_in_block(frame)
	local args = frame.args

	local block_name = args[1]
	if not block_name then
		error("Parameter 1 is required")
	end

	local low, high = Unicode_data.get_block_range(block_name)
	if not low then
		errorf("No block '%s' could be found", block_name)
	end

	local flag = args[2]
	local show_block_name = flag and require "Module:Yesno" (flag)

	local summary = show_scripts(fun.range(low, high))
	if show_block_name then
		return ("%s: %s"):format(block_name, summary)
	end
	return summary
end

return p