Module:Sandbox/Erutuon: Difference between revisions

Content deleted Content added
m all Wikipedia's private-use subtags have just one part
eliminated some redundant variables, making loops simpler; moved subtag list out of function; notes
Line 445:
})
-- Ordered description of the subtag types that may appear in a language tag.
-- The array part of each entry holds the Lua patterns a segment may match
-- (the parser anchors each pattern with "^" and "$"); the "type" field holds
-- the name of the subtag.
-- Entries are tried in sequence and any of them may be skipped: "language"
-- has to come before "script", yet a tag may go from a "language" subtag
-- straight to a "region" subtag with no "script" subtag in between.
-- Once every entry has been tried, whatever segments remain must form a
-- private-use subtag; otherwise the tag is invalid.
-- This table could be moved into a data module.
local subtag_info = {
	-- ll or lll, plus a special case. (Include extlang?)
	{ type = "language", "%a%a%a?", "1%a+" },
	-- Ssss
	{ type = "script", "%a%a%a%a" },
	-- rr or DDD
	{ type = "region", "%a%a", "%d%d%d" },
	-- 4 digits, or 5-8 alphanumeric characters. There can be multiple
	-- variants, so the entry is flagged as repeatable.
	{
		type = "variant",
		repeatable = true,
		"%d%d%d%d",
		"%w%w%w%w%w%w?%w?%w?",
	},
}
 
-- A previous draft, in [[Module:Lang/sandbox]]:
Line 481 ⟶ 501:
-- "invalid" is the portion of the tag after the last valid subtag (minus a
-- hyphen).
local potential_subtagssegments = mw.text.split(tag, "-")
local parsed_subtags = parsed_subtags_mt(potential_subtagssegments)
-- Language tags probably only contain ASCII alphabetic and numerical
Line 493 ⟶ 513:
return tag:find "[^A-Za-z0-9-]"
end,
potential_subtagssegments))
end
local subtag_i = 1 -- Index of current item in subtag_info.
-- An array of patterns for each subtag, and a "type" field for the name
local segment_i = 1 -- Index of current segment.
-- of the subtag.
while segments[segment_i] and subtag_info[subtag_i] do
-- The patterns are checked in order, and any of the subtags can be skipped.
local segment = segments[segment_i]
-- So, for example, the "language" subtag must precede the "script"
-- subtag, but a tag may contain a "language" subtag, no "script" subtag
-- and then a "region" subtag.
-- If the full list of subtags has been iterated over, the remaining subtags
-- must match the pattern for a private-use subtag, or the tag is invalid.
local subtag_info = { -- can be put in data module
{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case
-- include extlang?
{ "%a%a%a%a", type = "script" }, -- Ssss
{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD
{
"%d%d%d%d", -- 4 digits
"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters
type = "variant"
}
}
local index = 1
local last_matched_subtag_i = 0
for subtag_i, subtag in ipairs(potential_subtags) do
local subtag_type
while not subtag_type and subtag_info[subtag_i] do
local matched = false
-- Check each pattern for the subtag type at "subtag_i" in "subtag_info".
while not matched do
local cur_subtag = subtag_info[subtag_i]
-- Check each pattern for the subtag type at "index" in "subtag_info".
for _, pattern in ipairs(subtag_info[index]cur_subtag) do
if subtagsegment:find("^" .. pattern .. "$") then
subtag_type = subtag_info[index]cur_subtag.type
matched = true
-- There can be multiple "variant" subtags (and "extension"
-- subtags, if those are added).
if subtag_typenot ~= "variant"cur_subtag.repeatable then
indexsubtag_i = indexsubtag_i + 1
end
break
end
end
if not matched then -- Go to next item in subtag_info.
if not subtag_type then -- No match; try next subtag.
index = index + 1
subtag_i = subtag_i + 1
if not subtag_info[index] then
break
end
end
end
-- If language subtag has not been found, or the current segment has not
if subtag_i == 1 and subtag_type ~= "language" then
-- been matched as a subtag, break the loop and check for
-- a private-use subtag.
if segment_i == 1 and subtag_type ~= "language" or not subtag_type then
break
else
elseif subtag_type then
if parsed_subtags[subtag_type] then -- Create an array.
if type(parsed_subtags[subtag_type]) == "string" then
parsed_subtags[subtag_type] = { parsed_subtags[subtag_type] }
end -- else table
table.insert(parsed_subtags[subtag_type], subtagsegment)
else
parsed_subtags[subtag_type] = subtagsegment
end
last_matched_segment_i = segment_i
last_matched_subtag_i = subtag_i
elseif not subtag_info[index] then
break
end
segment_i = segment_i + 1
end
if segments[segment_i] then -- More segments to scan?
if last_matched_subtag_i < #potential_subtags then
-- Not all potential subtags were matched. Check for private-use subtags.
-- https://tools.ietf.org/html/bcp47#section-2.2.7
Line 567:
-- alphanumeric characters preceded by "x-".
-- Alphanumericity has already been checked.
if potential_subtags[last_matched_subtag_i + 1]
-- A tag must start with either a language subtag or a private-use subtag.
and potential_subtags[last_matched_subtag_i + 1]:lower() ~= "x" then
-- If next segment is not "x", introducing a private-use subtag, there
-- is no private-use subtag.
if segments[segment_i] and segments[segment_i]:lower() ~= "x" then
if not parsed_subtags.language then
return parsed_subtags:throw("no_language", 1)
else
return parsed_subtags:throw("invalid_subtag",
segment_i)
last_matched_subtag_i + 1)
end
elseif not segments[segment_i + 1] then
end
-- Check length of all following subtags.
if not potential_subtags[last_matched_subtag_i + 2] then
return parsed_subtags:throw("empty_private_use",
segment_i)
last_matched_subtag_i + 1)
end
-- Check length of all segments after "x".
for i = last_matched_subtag_i + 2, #potential_subtags do
for i = segment_i + 1, #segments do
local length = #potential_subtags[i]
local length = #segments[i]
if not (1 <= length and length <= 8) then
return parsed_subtags
:throw("invalid_private_use", segment_i)
last_matched_subtag_i + 1)
end
end
if not potential_subtagssegments[last_matched_subtag_ilast_matched_segment_i + 3] then -- There is only one private-use subtag.
parsed_subtags.private_use = potential_subtagssegments[last_matched_subtag_isegment_i + 21]
else
parsed_subtags.private_use = {}
for i = last_matched_subtag_isegment_i + 21, #potential_subtagssegments do
table.insert(parsed_subtags.private_use, potential_subtagssegments[i])
end
end
Line 616:
-- everything else is lowercase.
-- Check existence of language tag.
if parsed_subtags.language and
not (lang_data.override[parsed_subtags.language]
Line 622 ⟶ 623:
end
-- Check existence of script tag.
if parsed_subtags.script then
iflocal notlower_script = lang_name_table.script[parsed_subtags.script:lower()] then
if not lang_name_table.script[lower_script] then
mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag())
end
-- Check that script tag is not marked as superfluous (because
local lower_script = parsed_subtags.script:lower()
-- it is considered the default one for the language).
if lang_name_table.suppressed[lower_script]
and parsed_subtags.language
and m_table.inArray(
lang_name_table.suppressed[lower_script],
Line 636 ⟶ 641:
end
-- Check existence of region code.
if parsed_subtags.region and not lang_name_table.region[parsed_subtags.region:lower()] then
mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag())
end
-- Check that variant code is valid, and that it can validly be used with the
-- given combination of language, script, region, and variant.
-- Check for duplicate variant subtags?
if parsed_subtags.variant then
local lower_tag = parsed_subtags:get_tag():lower()
Line 660 ⟶ 669:
end
-- Check that the private-use subtag is actually used by Wikipedia.
if parsed_subtags.private_use and not (type(parsed_subtags.private_use) == "string"
and lang_data.override[parsed_subtags.tag]) then