Module:Lang/data/iana languages/make: Difference between revisions

Content deleted Content added
No edit summary
m fix module names;
 
(18 intermediate revisions by 4 users not shown)
Line 1:
require('strict');																-- Scribunto library: raise an error on access to undeclared globals

local p = {};																	-- export table; public entry points are attached to this
 
--[=[------------------------< G E T _ V A R I A N T _ P A R T S >---------------------------------------------
 
We get a record that looks more-or-less like this:
%%\n
Type: variant\n
Subtag: bohoric\n
Description: Slovene in Bohorič alphabet\n
Added: 2012-06-27\n
Prefix: sl\n
 
Each line is terminated with a \n character.
 
Type, for this function can only be 'variant'
 
Subtag is the code of Type
 
Prefix is a language code to which this variant applies; one language code per Prefix line. There can be
more than one prefix line.
 
Description associates Subtag with a proper name or names; one name per Description line. There can be more
than one Description line and Description lines can wrap to the next line. When they do, the first two
characters of the continuation line are spaces.
 
Comments: lines can also be continued so once in a Comments line (which is otherwise ignored) all further
continuations in the record are also ignored. This is a crude mechanism to prevent comment continuations
from being concatenated onto the end of descriptions and relies on Description lines occurring in the record
before the Comments line.
 
Records with private use subtags are ignored.
 
]=]
 
-- Parse one IANA 'variant' record.
-- record: the text between two %% delimiters; every line is expected to end with \n.
-- Returns 'skip' when the record mentions Deprecated, Preferred-Value, or Private use.
-- Otherwise returns three values:
--		code - the subtag (nil when the record has no Subtag line)
--		prefixes - comma-separated, double-quoted, lowercased Prefix values
--		descriptions - comma-separated, double-quoted Description values, embedded quote marks escaped
local function get_variant_parts (record)
	local code;
	local descriptions = {};													-- escaped description text; quote-wrapped just before returning
	local prefixes = {};
	local in_comments = false;

	local function escape_quotes (text)											-- parentheses drop gsub's second return value (the count)
		return (string.gsub (text, '"', '\\"'));
	end

	if string.find (record, 'Deprecated', 1, true) or string.find (record, 'Preferred-Value', 1, true)
		or string.find (record, 'Private use', 1, true) then
			return 'skip';
	end

	for line in string.gmatch (record, '([^\n]+)\n') do							-- get a \n terminated line of text (without the \n)
		local continuation = string.match (line, '^ +(.+)');					-- field names never start with a space, so test this before looking for a label
		if continuation then
			-- a continuation may itself contain a colon, so it must not be run through label detection;
			-- ignore it when inside Comments or when there is no description to extend
			if not in_comments and descriptions[#descriptions] then
				descriptions[#descriptions] = descriptions[#descriptions] .. ' ' .. escape_quotes (continuation);
			end
		else
			local label = string.match (line, '(.-):');
			if label == 'Subtag' then											-- if this line is the subtag line
				code = string.match (line, 'Subtag: (%w+)');					-- extract and save the subtag's code
			elseif label == 'Description' then									-- if this line is a description line
				local desc = string.match (line, 'Description: (.+)');
				if desc then													-- tolerate a Description line with no value
					table.insert (descriptions, escape_quotes (desc));			-- in case description contains quote marks (see 1959acad)
				end
			elseif label == 'Prefix' then										-- if this line is a prefix line
				local prefix = string.match (line, 'Prefix: (.+)');
				if prefix then													-- tolerate a Prefix line with no value
					table.insert (prefixes, '"' .. prefix:lower() .. '"');		-- save the lowercased prefix wrapped in quote marks
				end
			elseif label == 'Comments' then										-- if this line is a comments line
				in_comments = true;												-- all later continuations belong to the comment
			end
		end
	end

	for i, desc in ipairs (descriptions) do										-- wrap each finished description in quote marks
		descriptions[i] = '"' .. desc .. '"';
	end
	return code, table.concat (prefixes, ', '), table.concat (descriptions, ', ');
end
 
 
--[=[------------------------< G E T _ L A N G _ S C R I P T _ R E G I O N _ P A R T S >-----------------------
 
We get a record that looks more-or-less like this:
%%\n
Type: language\n
Line 28 ⟶ 93:
before the Comments line.
 
Records with private use subtags are ignored.
 
]=]
Line 34 ⟶ 99:
-- Parse one IANA 'language', 'script', or 'region' record.
-- record: the text between two %% delimiters; every line is expected to end with \n.
-- Returns 'skip' when the record mentions Private use.
-- Otherwise returns four values:
--		code - the subtag (nil when the record has no Subtag line)
--		descriptions - comma-separated, double-quoted Description values
--		suppress - the Suppress-Script value, or nil when the record has none
--		deprecated - true when the record has a Deprecated line, else nil
local function get_lang_script_region_parts (record)
	local code;
	local suppress;																-- Suppress script for this code if specified
	local deprecated;															-- boolean; true when subtag is deprecated
	local descriptions = {};
	local in_comments = false;

	if record:find ('Private use', 1, true) then								-- private use subtags are not wanted
		return 'skip';
	end

	for line in record:gmatch ('([^\n]+)\n') do									-- get a \n terminated line of text (without the \n)
		local label = line:match ('(.-):');
		if 'Subtag' == label then												-- if this line is the subtag line
			code = line:match ('Subtag: (%w+)');								-- extract and save the subtag's code
		elseif 'Description' == label then										-- if this line is a description line
			table.insert (descriptions, '\"' .. line:match ('Description: (.+)') .. '\"');	-- extract and save the name wrapped in quote marks
		elseif 'Deprecated' == label then
			deprecated = true;													-- subtag is deprecated; set our flag
		elseif 'Suppress-Script' == label then
			suppress = line:match ('Suppress%-Script: (%S+)');
		elseif 'Comments' == label then											-- if this line is a comments line
			in_comments = true;													-- all later continuations belong to the comment
		elseif line:find ('^ +.+') and not in_comments and descriptions[#descriptions] then	-- a continuation, not in comments, with a description to extend
			local previous = descriptions[#descriptions]:gsub ('\"$', '');		-- remove trailing quote mark from previous description
			descriptions[#descriptions] = previous .. ' ' .. line:match ('^ +(.+)') .. '\"';	-- append the continuation with a new quote mark
		end
	end
	return code, table.concat (descriptions, ', '), suppress, deprecated;
end
 
Line 61 ⟶ 133:
 
read a local copy of the IANA language-subtag-registry file and from it build tables to replace the tables in:
[[Module:Lang/data/iana languages]]
[[Module:Lang/data/iana regions]]
[[Module:Lang/data/iana scripts]]
[[Module:Lang/data/iana suppressed scripts]]
[[Module:Lang/data/iana variants]]
 
current language-subtag-registry file can be found at: http://www.iana.org/assignments/language-subtag-registry
Line 70 ⟶ 144:
]=]
 
local function p.iana_extract (frame)
local page = mw.title.getCurrentTitle(); -- get a page object for this page
local content = page:getContent(); -- get unparsed content
local lang_table = {}; -- languages go here
local lang_dep_table = {}; -- deprecated languages go here
local script_table = {}; -- scripts go here
local region_table = {}; -- regions go here
local variant_table = {}; -- variants go here
local suppress_table = {}; -- here we collect suppressed scripts and associated language codes
local iso_639_1_table = {}; -- ISO 639-1 languages; not used by Module:Lang but included here to ensure Module:Lang/data/ISO_639-1 gets updated
local file_date; -- first line
 
local code;
local descriptions;
local prefixes; -- used for language variants only
local suppress; -- a code's suppress script
local deprecated; -- boolean: true when subtag is deprecated
 
file_date = content:match ('(File%-Date: %d%d%d%d%-%d%d%-%d%d)'); -- get the file date line from this version of the source file
 
for record in mw.ustringstring.gmatch (content, '%%%%([^%%]+)') do -- get a %% delimited 'record' from the file; leave off the delimiters
iflocal mwrecord_type = string.ustring.find match(record, 'Type: language(%w+)') then -- if a language record
if record_type == 'language' then -- if a language record
code, descriptions = get_lang_script_region_parts (record); -- get the code and description(s)
code, descriptions, suppress, deprecated = get_lang_script_region_parts (record); -- get the code, description(s), suppress script, and deprecated flag
if code and ('skip' ~= code) then
if deprecated then
table.insert (lang_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries
table.insert (lang_dep_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries
else
table.insert (lang_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries
if 2 == code:len() then
table.insert (iso_639_1_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries
end
end
elseif not code then
table.insert (lang_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but inserting an error entry in the final output can be helpful
end
-- here we collect suppress script tags and their associated language codes;
-- prettifying the data in this table must wait until all language codes have been read
if suppress then -- if this code has a suppressed script
local suppressed_code = table.concat ({'\"', code, '\"'}); -- wrap the code in quotes
if suppress_table[suppress] then -- if there is an entry for this script
table.insert (suppress_table[suppress], suppressed_code); -- insert the new code
else
suppress_table[suppress] = {}; -- add new script and empty table
table.insert (suppress_table[suppress], suppressed_code); -- insert the new code
end
end
 
elseif mw.ustring.findrecord_type (record,== 'Type: script') then -- if a script record
code, descriptions = get_lang_script_region_parts (record); -- get the code and description(s)
Line 99 ⟶ 200:
table.insert (script_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries
elseif not code then
table.insert (lang_tablescript_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but ...
end
 
elseif mw.ustring.findrecord_type (record,== 'Type: region') then -- if a region record
code, descriptions = get_lang_script_region_parts (record); -- get the code and description(s)
Line 108 ⟶ 209:
table.insert (region_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries
elseif not code then
table.insert (lang_tableregion_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but ...
end
 
elseif record_type == 'variant' then -- if a variant record
code, prefixes, descriptions = get_variant_parts (record); -- get the code, prefix(es), and description(s)
 
if code and ('skip' ~= code) then
table.insert (variant_table,
table.concat ({
"[\"",
code,
"\"] = {<br />&#9;&#9;[\"descriptions\"] = {",
descriptions,
"},<br />&#9;&#9;[\"prefixes\"] = {",
prefixes,
"},<br />&#9;&#9;}"
})
);
elseif not code then
table.insert (variant_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but ...
end
end
end
-- now prettify the suppressed script table
local pretty_suppressed = {};
return "<br /><pre>-- " .. file_date .. "<br />return {<br />&#9;" .. table.concat (lang_table, ',<br />&#9;') .. "<br />&#9;}<br />-- " ..
file_date .. "<br />return {<br />&#9;" .. table.concat (script_table, ',<br />&#9;') .. "<br />&#9;}<br />-- " ..
for script, code_tbl in pairs (suppress_table) do
file_date .. "<br />return {<br />&#9;" .. table.concat (region_table, ',<br />&#9;') .. "<br />&#9;}<br />" .. "</pre>";
local LIMIT = 11; -- max number of subtags on a line before a line break
local fragment_tbl = {}; -- groups of LIMIT number of subtags collected here
for i=1, #code_tbl, LIMIT do
local stop = ((i+LIMIT-1) > #code_tbl) and #code_tbl or i+LIMIT-1; -- calculate a table.concat stop position
table.insert (fragment_tbl, table.concat (code_tbl, ', ', i, stop)); -- get the fragment and save it
end
table.insert (pretty_suppressed, -- and make all pretty
table.concat ({'[\"', script, '\"] = {', table.concat (fragment_tbl, ',\n\t\t\t\t'), '}'})
);
end
table.sort (pretty_suppressed);
 
-- make final output pretty
return '<br /><pre>------------------------------< I A N A L A N G U A G E S >--------------------------------------------------<br />--' ..
file_date .. "<br />local active = {<br />&#9;" .. table.concat (lang_table, ',<br />&#9;') .. "<br />&#9;}<br /><br />" ..
"local deprecated = {<br />&#9;" .. table.concat (lang_dep_table, ',<br />&#9;') .. "<br />&#9;}<br /><br />" ..
"return {<br />&#9;active = active,<br />&#9;deprecated = deprecated,<br />&#9;}<br /><br />" ..
'------------------------------< I A N A S C R I P T S >------------------------------------------------------<br />--' ..
file_date .. "<br />return {<br />&#9;" .. table.concat (script_table, ',<br />&#9;') .. "<br />&#9;}<br /><br />" ..
'------------------------------< I A N A R E G I O N S >------------------------------------------------------<br />--' ..
file_date .. "<br />return {<br />&#9;" .. table.concat (region_table, ',<br />&#9;') .. "<br />&#9;}<br /><br />" ..
'------------------------------< I A N A V A R I A N T S >----------------------------------------------------<br />--' ..
file_date .. "<br />return {<br />&#9;" .. table.concat (variant_table, ',<br />&#9;') .. "<br />&#9;}<br /><br />" ..
'------------------------------< I A N A S U P P R E S S E D S C R I P T S >--------------------------------<br />--' ..
file_date .. "<br />return {<br />&#9;" .. table.concat (pretty_suppressed, ',<br />&#9;') .. "<br />&#9;}<br /><br />" ..
'------------------------------< I S O 6 3 9 - 1 >------------------------------------------------------------<br />--' ..
file_date .. "<br />return {<br />&#9;" .. table.concat (iso_639_1_table, ',<br />&#9;') .. "<br />&#9;}<br /><br />" .. "</pre>";
end
 
 
--[[--------------------------< E X P O R T E D   F U N C T I O N S >------------------------------------------
]]

return p;																		-- a chunk may have only one return, and it must be the last statement