Module:Lang/data/iana languages/make: Difference between revisions

Browse history interactively

← Previous edit

Content deleted Content added

Revision as of 15:28, 4 November 2017 edit Trappist the monk (talk \| contribs) Administrators 494,447 edits No edit summary ← Previous edit		Latest revision as of 14:55, 10 July 2024 edit undo Trappist the monk (talk \| contribs) Administrators 494,447 edits m fix module names;
(18 intermediate revisions by 4 users not shown)
Line 1: require('~~Module:No globals~~strict'); ~~local p = {};~~ --[=[------------------------< G E T _ V A R I A N T _ P A R T S >--------------------------------------------- We get a record that looks more-or-less like this: %%\n Type: variant\n Subtag: bohoric\n Description: Slovene in Bohorič alphabet\n Added: 2012-06-27\n Prefix: sl\n Each line is terminated with a \n character. Type, for this function can only be 'variant' Subtag is the code of Type Prefix is a language code to which this variant applies; one language code per Prefix line. There can be more than one prefix line. Description associates Subtag with a proper name or names; one name per Description line. There can be more than one Description line and Description lines can wrap to the next line. When they do, the first two characters of the continuation line are spaces. Comments: lines can also be continued so once in a Comments line (which is otherwise ignored) all further continuations in the record are also ignored. This is a crude mechanism to prevent comment continuations from being concatenated onto the end of descriptions and relies on Description line occuring in the record before the Comments line. Records with private use subtags are ignored. ]=] local function get_variant_parts (record) local code; local descriptions = {}; local prefixes = {}; local in_comments = false; if string.find (record, 'Deprecated', 1, true) or string.find (record, 'Preferred-Value', 1, true) or string.find (record, 'Private use', 1, true) then return 'skip'; end for line in string.gmatch (record, '([^\n]+)\n') do -- get a \n terminated line of text (without the \n) local label = string.match(line, "(.-):") if not label and string.find (line, '^ .+') and not in_comments then -- if a continuation line but not a comments continuation descriptions[#descriptions] = string.gsub (descriptions[#descriptions], '\"$', ''); -- remove trailing quote mark from previous description descriptions[#descriptions] = descriptions[#descriptions] .. ' ' .. string.match (line, '^ (.+)') .. '\"'; -- extract and save the continuation with new quote mark elseif label == 'Subtag' then -- if this line is the subtag line code = string.match (line, 'Subtag: (%w+)'); -- extract and save to subtag's code elseif label == 'Description' then -- if this line is a description line local desc = string.match (line, 'Description: (.+)'); -- extract the description desc = string.gsub (desc, '"', '\\"'); -- in case description contains quote marks (see 1959acad) table.insert (descriptions, '\"' .. desc .. '\"'); -- save the description wrapped in quote marks elseif label == 'Prefix' then -- if this line is a prefix line table.insert (prefixes, '\"' .. string.match (line, 'Prefix: (.+)'):lower() .. '\"'); -- extract and save the prefix wrapped in quote marks elseif label == 'Comments' then -- if this line is a comments line in_comments = true; end end return code, table.concat (prefixes, ', '), table.concat (descriptions, ', '); end --[=[------------------------< G E T _ L A N G _ S C R I P T _ R E G I O N _ P A R T S >----------------------- We get ana record that looks more-or-less like this: %%\n Type: language\n Line 28 ⟶ 93: before the Comments line. Records with ~~Deprecated~~private ~~dates~~use ~~or Preferred-Value codes~~subtags are ignored ~~as are private use codes~~. ]=] Line 34 ⟶ 99: local function get_lang_script_region_parts (record) local code; local suppress; -- Suppress script for this code if specified local deprecated; -- boolean; true when subtag is deprecated local descriptions = {}; local in_comments = false; if ~~mw.ustring.match (~~record~~, 'Deprecated') or mw.ustring.~~:find (~~record, 'Preferred%-Value') or mw.ustring.find (record,~~ 'Private use') then return 'skip'; end for line in ~~mw.ustring.~~record:gmatch (~~record,~~ '([^\n]+)\n') do -- get a \n ~~terminate~~terminated line of text (without the \n) local label = line:match ('(.-):'); ~~if mw.ustring.match (line, 'Subtag: [%a%d]+') then -- if this line is the subtag line~~ ~~code = mw.ustring.match (line,~~if 'Subtag~~: ([%a%d]+)~~'); == label then -- ~~extract~~if ~~and~~this ~~save~~line tois the subtag's ~~code~~line ~~elseif~~ code ~~mw.ustring.match~~= (line,:match ('~~Description~~Subtag: .(%w+)') ~~then~~; -- ifextract ~~this~~and ~~line~~save isto asubtag's ~~description line~~code elseif 'Description' == label then -- if this line is a description line ~~table.insert (descriptions, '\"' .. mw.ustring.match (line, 'Description: (.+)') .. '\"'); -- extract and save the name wrapped in quote marks~~ table.insert (descriptions, '\"' .. line:match ('Description: (.+)') .. '\"'); -- extract and save the name wrapped in quote marks ~~elseif mw.ustring.match (line, 'Comments: .+') then -- if this line is a comments line~~ elseif 'Deprecated' == label then deprecated = true; -- subtag is deprecated; set our flag elseif 'Suppress-Script' == label then suppress = line:match ('Suppress%-Script: (%S+)'); elseif 'Comments' == label then -- if this line is a comments line in_comments = true; elseif ~~mw.ustring.match~~line:find (~~line,~~ '^ .+') and not in_comments then -- if a continuation line but not a commnets continuation descriptions[#descriptions] = ~~mw.ustring.gsub (~~descriptions[#descriptions],:gsub ('\"$', ''); -- remove trailing quote mark from previous description descriptions[#descriptions] = descriptions[#descriptions] .. ' ' .. ~~mw.ustring.~~line:match (~~line,~~ '^ (.+)') .. '\"'; -- extract and save the continuation with new quote mark end end return code, table.concat (descriptions, ', '), suppress, deprecated; end Line 61 ⟶ 133: read a local copy of the IANA language-subtag-registry file and from it build tables to replace the tables in: [[Module:~~Language~~Lang/data/iana languages]] [[Module:~~Language~~Lang/data/iana ~~scripts~~regions]] [[Module:~~Language~~Lang/data/iana ~~regions~~scripts]] [[Module:Lang/data/iana supressed cripts]] [[Module:Lang/data/iana variants]] current language-subtag-registry file can be found at: http://www.iana.org/assignments/language-subtag-registry Line 70 ⟶ 144: ]=] local function p.iana_extract (frame) local page = mw.title.getCurrentTitle(); -- get a page object for this page local content = page:getContent(); -- get unparsed content local lang_table = {}; -- languages go here local lang_dep_table = {}; -- deprecated languages go here local script_table = {}; -- scripts go here local region_table = {}; -- regions go here local variant_table = {}; -- variants go here local suppress_table = {}; -- here we collect suppressed scripts and associated language codes local iso_639_1_table = {}; -- ISO 639-1 languages; not used by Module:Lang but included here to ensure Module:Lang/data/ISO_639-1 gets updated local file_date; -- first line local code; local descriptions; local prefixes; -- used for language variants only local suppress; -- a code's suppress script local deprecated; -- boolean: true when subtag is deprecated file_date = content:match ('(File%-Date: %d%d%d%d%-%d%d%-%d%d)'); -- get the file date line from this version of the source file for record in ~~mw.ustring~~string.gmatch (content, '%%%%([^%%]+)') do -- get a %% delimited 'record' from the file; leave off the delimiters iflocal mwrecord_type = string.~~ustring.find~~ match(record, 'Type: ~~language~~(%w+)') ~~then -- if a language record~~ if record_type == 'language' then -- if a language record ~~code, descriptions = get_lang_script_region_parts (record); -- get the code and description(s)~~ code, descriptions, suppress, deprecated = get_lang_script_region_parts (record); -- get the code, description(s), suppress script, and deprecated flag if code and ('skip' ~= code) then if deprecated then ~~table.insert (lang_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries~~ table.insert (lang_dep_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries else table.insert (lang_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries if 2 == code:len() then table.insert (iso_639_1_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries end end elseif not code then table.insert (lang_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but inserting an error entry in the final output can be helpful end -- here we collect suppress stript tags and their associated language codes; -- prettigying the data in this table must wait until all language codes have been read if suppress then -- if this code has a suppressed script local suppressed_code = table.concat ({'\"', code, '\"'}); -- wrap the code in quotes if suppress_table[suppress] then -- if there is an entry for this script table.insert (suppress_table[suppress], suppressed_code); -- insert the new code else suppress_table[suppress] = {}; -- add new script and empty table table.insert (suppress_table[suppress], suppressed_code); -- insert the new code end end elseif ~~mw.ustring.find~~record_type ~~(record,~~== '~~Type:~~ script') then -- if a script record code, descriptions = get_lang_script_region_parts (record); -- get the code and description(s) Line 99 ⟶ 200: table.insert (script_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries elseif not code then table.insert (~~lang_table~~script_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but ... end elseif ~~mw.ustring.find~~record_type ~~(record,~~== '~~Type:~~ region') then -- if a region record code, descriptions = get_lang_script_region_parts (record); -- get the code and description(s) Line 108 ⟶ 209: table.insert (region_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries elseif not code then table.insert (~~lang_table~~region_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but ... end elseif record_type == 'variant' then -- if a variant record code, prefixes, descriptions = get_variant_parts (record); -- get the code, prefix(es), and description(s) if code and ('skip' ~= code) then table.insert (variant_table, table.concat ({ "[\"", code, "\"] = {<br /> [\"descriptions\"] = {", descriptions, "},<br /> [\"prefixes\"] = {", prefixes, "},<br /> }" }) ); elseif not code then table.insert (variant_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but ... end end end -- ~~make~~now ~~pretty~~prettify ~~output~~the supressed script table local pretty_suppressed = {}; ~~return "<br /><pre>-- " .. file_date .. "<br />return {<br /> " .. table.concat (lang_table, ',<br /> ') .. "<br /> }<br />-- " ..~~ ~~file_date .. "<br />return {<br /> " .. table.concat (script_table, ',<br /> ') .. "<br /> }<br />-- " ..~~ for script, code_tbl in pairs (suppress_table) do ~~file_date .. "<br />return {<br /> " .. table.concat (region_table, ',<br /> ') .. "<br /> }<br />" .. "</pre>";~~ local LIMIT = 11; -- max number of subtags on a line before a line break local fragment_tbl = {}; -- groups of LIMIT number of subtags collected here for i=1, #code_tbl, LIMIT do local stop = ((i+LIMIT-1) > #code_tbl) and #code_tbl or i+LIMIT-1; -- calculate a table.concat stop position table.insert (fragment_tbl, table.concat (code_tbl, ', ', i, stop)); -- get the fragment and save it end table.insert (pretty_suppressed, -- and make all pretty table.concat ({'[\"', script, '\"] = {', table.concat (fragment_tbl, ',\n\t\t\t\t'), '}'}) ); end table.sort (pretty_suppressed); -- make final output pretty return '<br /><pre>------------------------------< I A N A L A N G U A G E S >--------------------------------------------------<br />--' .. file_date .. "<br />local active = {<br /> " .. table.concat (lang_table, ',<br /> ') .. "<br /> }<br /><br />" .. "local deprecated = {<br /> " .. table.concat (lang_dep_table, ',<br /> ') .. "<br /> }<br /><br />" .. "return {<br /> active = active,<br /> deprecated = deprecated,<br /> }<br /><br />" .. '------------------------------< I A N A S C R I P T S >------------------------------------------------------<br />--' .. file_date .. "<br />return {<br /> " .. table.concat (script_table, ',<br /> ') .. "<br /> }<br /><br />" .. '------------------------------< I A N A R E G I O N S >------------------------------------------------------<br />--' .. file_date .. "<br />return {<br /> " .. table.concat (region_table, ',<br /> ') .. "<br /> }<br /><br />" .. '------------------------------< I A N A V A R I A N T S >----------------------------------------------------<br />--' .. file_date .. "<br />return {<br /> " .. table.concat (variant_table, ',<br /> ') .. "<br /> }<br /><br />" .. '------------------------------< I A N A S U P P R E S S E D S C R I P T S >--------------------------------<br />--' .. file_date .. "<br />return {<br /> " .. table.concat (pretty_suppressed, ',<br /> ') .. "<br /> }<br /><br />" .. '------------------------------< I S O 6 3 9 - 1 >------------------------------------------------------------<br />--' .. file_date .. "<br />return {<br /> " .. table.concat (iso_639_1_table, ',<br /> ') .. "<br /> }<br /><br />" .. "</pre>"; end ~~return p;~~ --[[--------------------------< E X P O R T E D F U N C T I O N >-------------------------------------------- ]] return { iana_extract = iana_extract, }