Revision as of 00:26, 7 July 2018 edit Erutuon (talk \| contribs) Autopatrolled, Extended confirmed users 32,238 edits m all Wikipedia's private-use subtags have just one part ← Previous edit		Revision as of 01:44, 7 July 2018 edit undo Erutuon (talk \| contribs) Autopatrolled, Extended confirmed users 32,238 edits eliminated some redundant variables, making loops simpler; moved subtag list out of function; notes Next edit →
Line 445: }) -- An array of patterns for each subtag, and a "type" field for the name -- of the subtag. -- The patterns are checked in order, and any of the subtags can be skipped. -- So, for example, the "language" subtag must precede the "script" -- subtag, but a tag may contain a "language" subtag, no "script" subtag -- and then a "region" subtag. -- If the full list of subtags has been iterated over, the remaining subtags -- must match the pattern for a private-use subtag, or the tag is invalid. local subtag_info = { -- can be put in data module { "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case -- include extlang? { "%a%a%a%a", type = "script" }, -- Ssss { "%a%a", "%d%d%d", type = "region" }, -- rr, DDD { "%d%d%d%d", -- 4 digits "%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters type = "variant", repeatable = true, -- There can be multiple variants. } } -- A previous draft, in [[Module:Lang/sandbox]]: Line 481 ⟶ 501: -- "invalid" is the portion of the tag after the last valid subtag (minus a -- hyphen). local ~~potential_subtags~~segments = mw.text.split(tag, "-") local parsed_subtags = parsed_subtags_mt(~~potential_subtags~~segments) -- Language tags probably only contain ASCII alphabetic and numerical Line 493 ⟶ 513: return tag:find "[^A-Za-z0-9-]" end, ~~potential_subtags~~segments)) end local subtag_i = 1 -- Index of current item in subtag_info. ~~-- An array of patterns for each subtag, and a "type" field for the name~~ local segment_i = 1 -- Index of current segment. ~~-- of the subtag.~~ while segments[segment_i] and subtag_info[subtag_i] do ~~-- The patterns are checked in order, and any of the subtags can be skipped.~~ local segment = segments[segment_i] ~~-- So, for example, the "language" subtag must precede the "script"~~ ~~-- subtag, but a tag may contain a "language" subtag, no "script" subtag~~ ~~-- and then a "region" subtag.~~ ~~-- If the full list of subtags has been iterated over, the remaining subtags~~ ~~-- must match the pattern for a private-use subtag, or the tag is invalid.~~ ~~local subtag_info = { -- can be put in data module~~ ~~{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case~~ ~~-- include extlang?~~ ~~{ "%a%a%a%a", type = "script" }, -- Ssss~~ ~~{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD~~ { ~~"%d%d%d%d", -- 4 digits~~ ~~"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters~~ ~~type = "variant"~~ } } ~~local index = 1~~ ~~local last_matched_subtag_i = 0~~ ~~for subtag_i, subtag in ipairs(potential_subtags) do~~ local subtag_type while not subtag_type and subtag_info[subtag_i] do ~~local matched = false~~ -- Check each pattern for the subtag type at "subtag_i" in "subtag_info". ~~while not matched do~~ local cur_subtag = subtag_info[subtag_i] ~~-- Check each pattern for the subtag type at "index" in "subtag_info".~~ for _, pattern in ipairs(~~subtag_info[index]~~cur_subtag) do if ~~subtag~~segment:find("^" .. pattern .. "$") then subtag_type = ~~subtag_info[index]~~cur_subtag.type ~~matched = true~~ -- There can be multiple "variant" subtags (and "extension" -- subtags, if those are added). if ~~subtag_type~~not ~~~= "variant"~~cur_subtag.repeatable then ~~index~~subtag_i = ~~index~~subtag_i + 1 end break end end ~~if not matched then -- Go to next item in subtag_info.~~ if not subtag_type then -- No match; try next subtag. ~~index = index + 1~~ subtag_i = subtag_i + 1 ~~if not subtag_info[index] then~~ ~~break~~ ~~end~~ end end -- If language subtag has not been found, or the current segment has not ~~if subtag_i == 1 and subtag_type ~= "language" then~~ -- been matched as a subtag, break the loop and check for -- a private-use subtag. if segment_i == 1 and subtag_type ~= "language" or not subtag_type then break else ~~elseif subtag_type then~~ if parsed_subtags[subtag_type] then -- Create an array. if type(parsed_subtags[subtag_type]) == "string" then parsed_subtags[subtag_type] = { parsed_subtags[subtag_type] } end -- else table table.insert(parsed_subtags[subtag_type], ~~subtag~~segment) else parsed_subtags[subtag_type] = ~~subtag~~segment end last_matched_segment_i = segment_i ~~last_matched_subtag_i = subtag_i~~ ~~elseif not subtag_info[index] then~~ ~~break~~ end segment_i = segment_i + 1 end if segments[segment_i] then -- More segments to scan? ~~if last_matched_subtag_i < #potential_subtags then~~ -- Not all potential subtags were matched. Check for private-use subtags. -- https://tools.ietf.org/html/bcp47#section-2.2.7 Line 567: -- alphanumeric characters preceded by "x-". -- Alphanumericity has already been checked. ~~if potential_subtags[last_matched_subtag_i + 1]~~ -- A tag must start with either a language subtag or a private-use subtag. ~~and potential_subtags[last_matched_subtag_i + 1]:lower() ~= "x" then~~ -- If next segment is not "x", introducing a private-use subtag, there -- is no private-use subtag. if segments[segment_i] and segments[segment_i]:lower() ~= "x" then if not parsed_subtags.language then return parsed_subtags:throw("no_language", 1) else return parsed_subtags:throw("invalid_subtag", segment_i) ~~last_matched_subtag_i + 1)~~ end elseif not segments[segment_i + 1] then ~~end~~ ~~-- Check length of all following subtags.~~ ~~if not potential_subtags[last_matched_subtag_i + 2] then~~ return parsed_subtags:throw("empty_private_use", segment_i) ~~last_matched_subtag_i + 1)~~ end -- Check length of all segments after "x". ~~for i = last_matched_subtag_i + 2, #potential_subtags do~~ for i = segment_i + 1, #segments do ~~local length = #potential_subtags[i]~~ local length = #segments[i] if not (1 <= length and length <= 8) then return parsed_subtags :throw("invalid_private_use", segment_i) ~~last_matched_subtag_i + 1)~~ end end if not ~~potential_subtags~~segments[~~last_matched_subtag_i~~last_matched_segment_i + 3] then -- There is only one private-use subtag. parsed_subtags.private_use = ~~potential_subtags~~segments[~~last_matched_subtag_i~~segment_i + 21] else parsed_subtags.private_use = {} for i = ~~last_matched_subtag_i~~segment_i + 21, #~~potential_subtags~~segments do table.insert(parsed_subtags.private_use, ~~potential_subtags~~segments[i]) end end Line 616: -- everything else is lowercase. -- Check existence of language tag. if parsed_subtags.language and not (lang_data.override[parsed_subtags.language] Line 622 ⟶ 623: end -- Check existence of script tag. if parsed_subtags.script then iflocal ~~not~~lower_script = ~~lang_name_table.script[~~parsed_subtags.script:lower()~~] then~~ if not lang_name_table.script[lower_script] then mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag()) end -- Check that script tag is not marked as superfluous (because the ~~local lower_script = parsed_subtags.script:lower()~~ -- it is considered the default one for the language). if lang_name_table.suppressed[lower_script] and parsed_subtags.language and m_table.inArray( lang_name_table.suppressed[lower_script], Line 636 ⟶ 641: end -- Check existence of region code.. if parsed_subtags.region and not lang_name_table.region[parsed_subtags.region:lower()] then mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag()) end -- Check that variant code is valid, and that it can validly be used with the -- given combination of language, script, region, and variant. -- Check for duplicate variant subtags? if parsed_subtags.variant then local lower_tag = parsed_subtags:get_tag():lower() Line 660 ⟶ 669: end -- Check that the private-use subtag is actually used by Wikipedia. if parsed_subtags.private_use and not (type(parsed_subtags.private_use) == "string" and lang_data.override[parsed_subtags.tag]) then

Module:Sandbox/Erutuon: Difference between revisions