Wikipedia:WikiProject Interlanguage Links/Scripts: Difference between revisions

Content deleted Content added
Topbanana (talk | contribs)
m Created page with 'Into a mysaql database load the ''page'' and ''langlinks'' database dump files from two or more wikipedia Wikipedia:Database_download. Extra...'
 
Topbanana (talk | contribs)
mNo edit summary
 
(7 intermediate revisions by the same user not shown)
Line 1:
Into a [[mysaqlmysql]] database load the ''page'' and ''langlinks'' database dump files from two or more wikipedia [[database dumps|Wikipedia:Database_download]].
 
Extract interlanguage links into a single table for anaylysis
 
----
<pre>
CREATE TABLE inter (
from_lang varchar(10),
from_title varchar(255),
to_lang varchar(10),
to_title varchar(255)
);
 
USE en;
 
Line 29:
... etc
</pre>
----
 
Tidy up this table and index it:
 
----
<pre>
 
 
 
update inter set from_title = replace( from_title, '_', ' ' );
update inter set to_title = replace( to_title, '_', ' ' );
Line 48 ⟶ 47:
ALTER TABLE inter ADD INDEX ( to_lang, to_title );
</pre>
----
 
Extract the information required, ie:
 
----
And extract suggested red links, ie:
<pre>
// suggested reciporcal links
CREATE TABLE suggestions (
from_lang varchar(10),
from_title varchar(255),
to_lang varchar(10),
to_title varchar(255)
);
 
INSERT INTO suggestions
SELECT DISTINCT a.from_lang, a.from_title, b.to_lang, b.to_title
FROM inter a
INNER JOIN inter b
ON a.to_lang = b.from_lang
AND a.to_title = b.from_title
LEFT JOIN inter c
ON c.from_lang = a.from_lang
AND c.from_title = a.from_title
AND c.to_lang = b.to_lang // Not c.to_title - any link from a to c existsing is sufficient
WHERE a.from_lang = 'en'
AND b.to_lang != a.from_lang
AND c.from_lang IS NULL;
 
// a:x -> b:x and b:x exists and no link from b:? to a:x exists => b:x -> a:x
 
INSERT INTO suggestions
SELECT concat( '*[[', a.to_title, ']] &rarr; [[:', a.from_lang, ':', a.from_title, ']]' )
SELECT a.to_lang, a.to_title, a.from_lang, a.from_title
FROM inter a
INNER JOIN en.page p
Line 85 ⟶ 66:
AND b.from_title = a.to_title
AND b.to_lang = a.from_lang
WHERE a.from_lang =IN ('de','es','fr','it','nl','ja','pl','pt','sv' )
AND a.to_lang = 'en'
AND b.from_lang IS NULL;
 
// Interlanguage links to pages that do not exist
// en -> fr only
INSERT INTO suggestions
SELECT a.from_lang, a.from_title, a.to_lang, a.to_title
FROM inter a
INNER JOIN en.page f
ON f.page_title = a.from_title
AND f.page_namespace = 0
AND f.page_is_redirect = 0
LEFT JOIN fr.page t
ON t.page_title = a.to_title
AND t.page_namespace = 0
WHERE a.from_lang = 'en'
AND a.to_lang = 'fr'
AND t.page_title IS NULL
 
 
// Interlanguage links to redirects
// en -> fr only
 
SELECT a.from_lang, a.from_title, a.to_lang, a.to_title
FROM inter a
INNER JOIN en.page f
ON f.page_title = a.from_title
AND f.page_namespace = 0
AND f.page_is_redirect = 0
INNER JOIN fr.page t
ON a.to_title = t.page_title
AND t.page_namespace = 0
AND t.page_is_redirect = 1
WHERE a.from_lang = 'en'
AND a.to_lang = 'fr';
</pre>
----
 
Finally, extract these suggestions in a human-readable format
----
<pre>
DROP PROCEDURE IF EXISTS report_suggestions;
DELIMITER //
 
CREATE PROCEDURE report_suggestions( group_size INT, flang VARCHAR(10), tlang VARCHAR(10) )
BEGIN
DECLARE sug_pos, sug_base, done INT;
DECLARE ftitle, ttitle VARCHAR(255);
DECLARE sug CURSOR FOR SELECT DISTINCT from_title, to_title FROM suggestions WHERE from_lang = flang AND to_lang = tlang;
DECLARE CONTINUE HANDLER FOR SQLSTATE '23000' SET done = 1;
 
SET sug_pos = 0;
SET sug_base = 0;
SET done = 0;
OPEN sug;
 
REPEAT
 
FETCH sug INTO ftitle, ttitle;
 
IF NOT done THEN
IF sug_pos = 0 THEN
SELECT concat( '=== ', sug_base, ' - ', sug_base + group_size - 1, ' ===' );
SET sug_base = sug_base + group_size;
SET sug_pos = group_size - 1;
ELSE
SET sug_pos = sug_pos - 1;
END IF;
 
SELECT concat( '*[[', ftitle, ']] &rarr; [[:', tlang, ':', ttitle, ']]' );
END IF;
 
UNTIL done END REPEAT;
CLOSE sug;
END;
//
 
DELIMITER ;
 
call report_suggestions( 10, 'en', 'sv' );
</pre>