This is a crude tool that reads a local copy of an IANA language-subtag-registry file and extracts the information necessary to create the data tables held by:
- Module:Lang/data/iana languages
- Module:Lang/data/iana scripts
- Module:Lang/data/iana regions
- Module:Lang/data/iana variants
- Module:Lang/data/iana suppressed scripts
- Module:Lang/data/ISO 639-1 – not an IANA file per se, but used by Module:ISO 639 name; included here so that -1 is not neglected when the other files are updated
The tool skips records that contain the words: 'Deprecated', 'Preferred-Value', and 'Private use'.
At this writing, the tool extracts only the subtag code and description(s) from language, script, region, and variant records.
Usage
To use this tool:
- Open a blank sandbox page and paste the following at the top:
{{#invoke:Language/data/iana languages/make|iana_extract}}
- Go to the current language-subtag-registry file (or any of the files held by archive.org). Copy the whole file (or just as much as you need) and paste it into the sandbox page below the {{#invoke:}}.
- Click Show preview
- Wait
- Copy result
There is some crude error checking that will insert an error message in the output. No guarantees that such messaging will be helpful. Search for the word 'error' in the tool's output.
require('Module:No globals');										-- NOTE(review): presumably turns accidental use of undeclared globals into an error; confirm against that module
local p = {};														-- table of functions exported by this module; returned at the end of the file
--[=[------------------------< G E T _ L A N G _ S C R I P T _ R E G I O N _ P A R T S >-----------------------

Extracts the subtag code and the description(s) from a single registry record.

We get an element that looks more-or-less like this:
	%%\n
	Type: language\n
	Subtag: aa\n
	Description: Afar\n
	Added: 2005-10-16\n

Each line is terminated with a \n character (the final line of <element> may lack it).

Type, for our purposes can be 'language', 'script', or 'region'
Subtag is the code of Type
Description associates Subtag with a proper name or names; one name per Description line.  There can be more
than one Description line and Description lines can wrap to the next line.  When they do, the continuation
line begins with spaces.  Other fields (Comments, for example) can wrap the same way; their continuation lines
are ignored here.

Parameters:
	element – text of one record (the text between two %% delimiters)

Returns two values:
	code – the subtag code, or nil when <element> has no Subtag line
	descriptions – a string holding each description wrapped in double quote marks, comma-space separated
		(empty string when there are no Description lines); " and \ inside a description are backslash-escaped
		so that the string can be pasted into lua source

]=]

local function get_lang_script_region_parts (element)
	local code;
	local descriptions = {};										-- raw (unquoted) description text in file order
	local in_description = false;									-- true while the most recent field line was a Description line

	for line in mw.ustring.gmatch (element, '[^\n]+') do			-- get each non-empty line of text (without the \n); last line need not be \n terminated
		local continuation = mw.ustring.match (line, '^ +(%S.*)');	-- continuation lines begin with spaces; capture what follows the indent
		if continuation then
			if in_description then									-- only Description continuations are wanted; wrapped Comments etc are skipped
				descriptions[#descriptions] = descriptions[#descriptions] .. ' ' .. continuation;
			end
		else
			in_description = false;									-- any non-continuation line starts a new field
			local subtag = mw.ustring.match (line, '^Subtag: ([%a%d]+)');
			local description = mw.ustring.match (line, '^Description: (.+)');
			if subtag then											-- if this line is the subtag line
				code = subtag;
			elseif description then									-- if this line is a description line
				descriptions[#descriptions + 1] = description;
				in_description = true;
			end
		end
	end

	for i, description in ipairs (descriptions) do					-- quote only after all continuations have been joined
		local escaped = mw.ustring.gsub (description, '(["\\])', '\\%1');	-- keep the generated lua string literal valid
		descriptions[i] = '"' .. escaped .. '"';
	end

	return code, table.concat (descriptions, ', ');
end
--[=[------------------------< I A N A _ E X T R A C T >-------------------------------------------------------

read a local copy of the IANA language-subtag-registry file and from it build tables to replace the tables in:
	[[Module:Language/data/iana languages]]
	[[Module:Language/data/iana scripts]]
	[[Module:Language/data/iana regions]]

current language-subtag-registry file can be found at: http://www.iana.org/assignments/language-subtag-registry
archive.org has copies of previous versions see: https://web.archive.org/web/*/http://www.iana.org/assignments/language-subtag-registry

The registry text is read from the unparsed content of the page that invokes this function.  Every %% delimited
record whose text contains 'Type: language', 'Type: script', or 'Type: region' produces one entry in the matching
table; records of any other type are ignored.  No other filtering of records is done here.

A record that has no subtag code gets an ["error"] entry in the table for its own type.  When the File-Date line
is missing (a partial copy of the registry was pasted) or the page has no content, the output carries a comment
that contains the word 'error' instead of raising a script error.

Parameters:
	frame – not used; accepted because {{#invoke:}} supplies it

Returns a string: three 'return {...}' tables (languages, scripts, regions), each preceded by a comment that
holds the File-Date line, all wrapped in <pre>...</pre>.

]=]

function p.iana_extract (frame)
	local page = mw.title.getCurrentTitle();						-- get a page object for this page
	local content = page:getContent();								-- get unparsed content

	if not content then												-- nothing to read; say so instead of indexing nil
		return '<br /><pre>-- error: this page has no content to read</pre>';
	end

	local lang_table = {};											-- languages go here
	local script_table = {};										-- scripts go here
	local region_table = {};										-- regions go here

	local targets = {												-- checked in this order; first match wins
		{pattern = 'Type: language', entries = lang_table},
		{pattern = 'Type: script', entries = script_table},
		{pattern = 'Type: region', entries = region_table},
		};

	local file_date = content:match ('(File%-Date: %d%d%d%d%-%d%d%-%d%d)')	-- get the file date line from this version of the source file
		or 'error: File-Date line not found';						-- partial pastes may omit it

	for element in mw.ustring.gmatch (content, '%%%%([^%%]+)') do	-- get a %% delimited 'element' from the file; leave off the delimiters; an element ends at the next % character
		for _, target in ipairs (targets) do
			if mw.ustring.find (element, target.pattern) then
				local code, descriptions = get_lang_script_region_parts (element);	-- get the code and description(s)
				if code then
					table.insert (target.entries, string.format ('["%s"] = {%s}', code, descriptions));	-- make table entries
				else
					table.insert (target.entries, '["error"] = {}');	-- code should never be nil, but inserting an error entry in the final output can be helpful
				end
				break;												-- an element belongs to one table only
			end
		end
	end

	local function section (entries)								-- one 'return {...}' table preceded by the file date comment
		return '-- ' .. file_date .. '<br />return {<br />\t' .. table.concat (entries, ',<br />\t') .. '<br />\t}<br />';
	end

																	-- make pretty output
	return '<br /><pre>' .. section (lang_table) .. section (script_table) .. section (region_table) .. '</pre>';
end
return p;															-- export the module table so that {{#invoke:}} can reach p.iana_extract