This is a crude tool that reads a local copy of an IANA language-subtag-registry file and extracts the information necessary to create the data tables held by:
- Module:Lang/data/iana languages
- Module:Lang/data/iana scripts
- Module:Lang/data/iana regions
- Module:Lang/data/iana variants
- Module:Lang/data/iana suppressed scripts
- Module:Lang/data/ISO 639-1 – not an IANA file per se, but used by Module:ISO 639 name; included here so that -1 is not neglected when the other files are updated
The tool skips records that contain the words: 'Deprecated', 'Preferred-Value', and 'Private use'.
At this writing, the tool extracts only the subtag code and description(s) from language, script, region, and variant records.
Usage
To use this tool:
- Open a blank sandbox page and paste the following at the top:
{{#invoke:Language/data/iana languages/make|iana_extract_lang}}
- Go to the current language-subtag-registry file (or any of the files held by archive.org). Copy the whole file (or just as much of it as you need) and paste it into the sandbox page below the {{#invoke:}}.
- Click Show preview
- Wait
- Copy result
There is some crude error checking that will insert an error message in the output. No guarantees that such messaging will be helpful. Search for the word 'error' in the tool's output.
local p = {};													-- module export table; local so that loading the module does not create a global
--[=[------------------------< G E T _ L A N G _ S C R I P T _ R E G I O N _ P A R T S >----------------------

We get an element that looks more-or-less like this:
	%%\n
	Type: language\n
	Subtag: aa\n
	Description: Afar\n
	Added: 2005-10-16\n

Each line is normally terminated with a \n character; a final line without one is still read.

Type, for our purposes can be 'language', 'script', or 'region'
Subtag is the code of Type
Description associates Subtag with a proper name or names.  There can be more than one Description line and
Description lines can wrap to the next line.  When they do, the continuation line begins with whitespace.

Continuation lines are appended to a description only when they directly continue a Description field; wrapped
lines that belong to any other field (Comments, for example) are ignored.

Returns two values:
	code – the text of the Subtag field, or nil when the element has no Subtag line
	descriptions – a string of comma-separated, double-quoted descriptions ready to be placed inside a lua table
		constructor; double-quote and backslash characters in a description are backslash-escaped so that
		the emitted text remains valid lua.  Empty string when the element has no Description line.

]=]

local function get_lang_script_region_parts (element)
	local code;													-- stays nil when no Subtag line is found
	local descriptions = {};									-- raw (unquoted) description text, one entry per Description field
	local in_description = false;								-- true only while the most recent field line was a Description

	for line in mw.ustring.gmatch (element, '[^\n]+') do		-- every non-empty line, including an unterminated last line
		local continuation = mw.ustring.match (line, '^%s+(%S.*)');	-- wrapped lines start with whitespace; capture without the indent
		if continuation then
			if in_description then								-- only a wrapped Description extends the last description
				descriptions[#descriptions] = descriptions[#descriptions] .. ' ' .. continuation;
			end
		else
			local subtag = mw.ustring.match (line, '^Subtag: ([%a%d]+)');
			local description = mw.ustring.match (line, '^Description: (.+)');

			if subtag then
				code = subtag;
			end
			if description then
				table.insert (descriptions, description);
			end
			in_description = nil ~= description;				-- any other field line ends the current description
		end
	end

	for i, description in ipairs (descriptions) do				-- quote each description, escaping characters that would break a lua string
		descriptions[i] = '"' .. mw.ustring.gsub (description, '(["\\])', '\\%1') .. '"';
	end

	return code, table.concat (descriptions, ', ');
end
--[=[------------------------< I A N A _ E X T R A C T _ L A N G >---------------------------------------------

read a local copy of the IANA language-subtag-registry file (pasted into the page that invokes this function) and
from it build the rudiments of the language, script, and region data tables, e.g. the table in
[[Module:Language/data/iana languages]].

Records in the registry file are separated by '%%' and look like this:
	%%
	Type: language
	Subtag: aa
	Description: Afar

Only records whose Type is 'language', 'script', or 'region' are used; records without a Subtag line are skipped.

frame is supplied by {{#invoke:}} and is not used.

Returns a string of <pre>-wrapped lua source holding three 'return {...}' tables (languages, scripts, regions),
each preceded by a comment with the registry's File-Date.  When the page content cannot be read the returned
string is an error message; when no File-Date line is found an error message takes its place in the output.

NOTE: records are delimited with the pattern '%%%%[^%%]+' so a record that itself contains a '%' character is cut
short at that character.

]=]

function p.iana_extract_lang (frame)
	local content = mw.title.getCurrentTitle():getContent();	-- unparsed content of the page that invoked us
	if not content then											-- getContent() returns nil when there is nothing to read
		return 'error: unable to read the content of this page';
	end

	local file_date = content:match ('File%-Date: %d%d%d%d%-%d%d%-%d%d') or 'error: File-Date not found';

	local entries_by_type = {									-- one list of formatted table entries for each record type we keep
		language = {},
		script = {},
		region = {},
		};

	for element in mw.ustring.gmatch (content, '%%%%[^%%]+') do	-- each record begins with the '%%' separator
		local record_type = mw.ustring.match (element, '\nType: (%a+)');	-- read the Type field itself, not just any text that looks like it
		local entries = record_type and entries_by_type[record_type];

		if entries then											-- nil for record types that we do not extract
			local code, descriptions = get_lang_script_region_parts (element);
			if code then
				table.insert (entries, '["' .. code .. '"] = {' .. descriptions .. '}');	-- make table entry
			end
		end
	end

	local function make_table_source (entries)					-- one dated 'return {...}' table in pretty form
		return '-- ' .. file_date .. '<br />return {<br />\t' .. table.concat (entries, ',<br />\t') .. '<br />\t}<br />';
	end

	return '<br /><pre>' ..
		make_table_source (entries_by_type.language) ..
		make_table_source (entries_by_type.script) ..
		make_table_source (entries_by_type.region) ..
		'</pre>';
end
return p;														-- return the table that holds this module's exported function(s)