// User:Harej/citation-watchlist.js
//
// Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
/**
 *
 * Citation Watchlist
 * https://en.wikipedia.org/wiki/WP:WATCHCITE
 *
 */
/**
 * ==========================================================================
 * Domain List Configuration
 * ==========================================================================
 *
 * Citation Watchlist requires the following wiki pages to function:
 *
 * 1. Public Suffix List
 *    - A local copy of the public suffix list, used for ___domain parsing.
 *    - Copy the contents of:
 *        https://en.wikipedia.org/wiki/Wikipedia:Citation_Watchlist/Public_Suffix_List
 *      to a page on your own wiki.
 *    - Update the `publicSuffixList` variable below to reflect your page title.
 *
 * 2. List of Lists
 *    - A page linking to one or more ___domain list pages.
 *    - Format as a bullet list: "* [[Page Title]]" (space after asterisk).
 *    - Reference formatting example:
 *        https://en.wikipedia.org/wiki/Wikipedia:Citation_Watchlist/Lists
 *    - Update the `listOfLists` variable below accordingly.
 *
 * 3. Domain List Pages
 *    - One or more pages listing suspicious or noteworthy domains.
 *    - Each page must contain section headers that match the `indicators` config
 *      below (e.g., "==Warn==", "==Caution==").
 *    - Under each section, list domains in the format: "* example.com"
 *    - Do not use link formatting—just plain text.
 */


// Title of the wiki page holding the local copy of the public suffix list
// (see "Domain List Configuration" notes above).
const publicSuffixList = "Wikipedia:Citation_Watchlist/Public_Suffix_List";
// Title of the "list of lists" page that links to the individual ___domain list pages.
const listOfLists = "Wikipedia:Citation_Watchlist/Lists";


/**
 * ==========================================================================
 * Indicator Configuration
 * ==========================================================================
 *
 * Defines metadata for ___domain indicators used in the watchlist UI.
 * Each indicator is associated with a level of urgency and a unique symbol.
 *
 * Fields:
 * - msg:     Display label for the level (e.g., "Warning", "Caution").
 * - emoji:   Unicode character for the visual indicator (escaped as `\uXXXX`).
 * - section: Must exactly match the section headers in the ___domain list pages.
 * - priority: Higher values override lower ones for conflicting ___domain matches.
 *             Priority scale: 1 (lowest) to N (highest).
 * - list:    Defined as "new Set()" for all indicator types.
 *
 * If a ___domain appears in multiple lists, the one with the highest priority
 * takes precedence.
 */


// Indicator definitions keyed by type. Each `list` set starts empty and is
// populated at runtime from the ___domain list pages (see
// fetchAndOrganizeDomainLists); `section` must match a header on those pages.
const indicators = {
    // Highest-priority indicator, fed from the "==Warn==" section.
    warning: {
        msg: "Warning",
        emoji: '\u2757', // ❗
        section: "==Warn==",
        priority: 3,
        list: new Set()
    },
    // Mid-priority indicator, fed from the "==Caution==" section.
    caution: {
        msg: "Caution",
        emoji: '\u270B', // ✋
        section: "==Caution==",
        priority: 2,
        list: new Set()
    },
    // Lowest list-driven indicator, fed from the "==Inspect==" section.
    inspect: {
        msg: "Inspect",
        emoji: '\uD83D\uDD0E', // 🔎
        section: "==Inspect==",
        priority: 1,
        list: new Set()
    },
    // Not sourced from any list page (section: null); applied by
    // analyzeRevision to domains whose URLs were removed in a diff.
    removed: {
        msg: "Removed",
        emoji: '\u{1F5D1}', // 🗑
        section: null,
        priority: -1,
        list: new Set()
    }
};


/**
 * Citation Watchlist
 *
 * Highlights potentially questionable citations added in Wikipedia revisions,
 * using predefined ___domain lists and a public suffix list to analyze diffs.
 *
 *
 * Documentation: https://en.wikipedia.org/wiki/WP:WATCHCITE
 *
 * Author: James Hare under contract with Hacks/Hackers
 * License: GNU General Public License v3.0 (GPL-3.0)
 *
 * @version 1.14
 * @since 2025-06-27
 */


/**
 * If you want to modify any part of the script below this point, please submit
 * your edits to https://test.wikipedia.org/wiki/User:Harej/citation-watchlist.js
 * so that your modifications can be tested.
 */

// Set of public suffixes (e.g. "co.uk"); populated by analyzeView() from
// fetchPublicSuffixList() and consumed by getRootDomain().
let publicSuffixSet = new Set();
// "Namespace:" prefixes for every namespace except main ('0') and draft ('118'),
// with spaces replaced by underscores; used by isNotArticle() to skip
// non-article page titles.
const namespaces = Object.entries(mw.config.get('wgFormattedNamespaces'))
    .filter(([num, name]) => num !== '0' && num !== '118')
    .map(([_, name]) => name.replace(/ /g, '_') + ':');

/**
 * Main entry point for Citation Watchlist.
 * Determines if the current page should be analyzed, fetches ___domain and suffix
 * lists, processes each change/revision in the recent changes or history page,
 * and triggers analysis to highlight questionable domains.
 */
/**
 * Main entry point for Citation Watchlist.
 * Determines if the current page should be analyzed, fetches ___domain and suffix
 * lists, processes each change/revision in the recent changes or history page,
 * and triggers analysis to highlight questionable domains.
 */
async function analyzeView() {
    purgeExpiredCache();
    const ns = mw.config.get('wgNamespaceNumber');
    // Only run on special pages (-1), articles (0), and drafts (118).
    if (![-1, 0, 118].includes(ns)) {
        return;
    }
    publicSuffixSet = await fetchPublicSuffixList();
    if (publicSuffixSet.size === 0) {
        console.error('Public Suffix List loading failed');
        return;
    }
    console.log("Welcome to Citation Watchlist");
    const listPages = await fetchDomainListPages(listOfLists);
    if (listPages) {
        const lists = await fetchAndOrganizeDomainLists(listPages);
        if (lists) {
            // Merge fetched ___domain sets into the global indicator sets.
            for (const type in indicators) {
                lists[type].list.forEach(indicators[type].list.add, indicators[type].list);
            }
        }
    }
    const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
    let noLinks = true;
    for (const container of entriesContainers) {
        const diffLink = container.querySelector('a.mw-changeslist-diff');
        const histLink = container.querySelector('a.mw-changeslist-history');
        const prevLink = container.querySelector(
            'a.mw-history-histlinks-previous');
        const curLink = container.querySelector('a.mw-history-histlinks-current');
        let revision = null;
        let urlParams = '';
        if (diffLink) {
            noLinks = false;
            const diffUrl = new URL(diffLink.href);
            urlParams = new URLSearchParams(diffUrl.search);
            const pageTitle = urlParams.get('title');
            if (isNotArticle(pageTitle)) continue;
            revision = {
                oldrevision: urlParams.get('diff'),
                newrevision: urlParams.get('oldid'),
                element: diffLink.parentNode.parentNode
            };
            if (revision.oldrevision === 'prev') { // This happens on user contributions pages
                const previousRevisionMap = await fetchPreviousRevisionIds(
                    [revision.newrevision]);
                revision.oldrevision = revision.newrevision;
                revision.newrevision = previousRevisionMap[revision.newrevision];
            }
        } else if (histLink) {
            noLinks = false;
            const histUrl = new URL(histLink.href);
            urlParams = new URLSearchParams(histUrl.search);
            const pageTitle = urlParams.get('title');
            if (isNotArticle(pageTitle)) continue;
            const firstID = await fetchFirstRevisionId(pageTitle);
            if (!firstID) continue;
            revision = {
                oldrevision: firstID,
                element: histLink.parentNode.parentNode
            };
        } else if (prevLink) {
            noLinks = false;
            urlParams = new URLSearchParams(prevLink.href);
            const previousRevisionMap = await fetchPreviousRevisionIds(
                [urlParams.get('oldid')]);
            revision = {
                oldrevision: urlParams.get('oldid'),
                newrevision: previousRevisionMap[urlParams.get('oldid')],
                element: prevLink.parentNode.parentNode
            };
        } else if (curLink) {
            noLinks = false;
            urlParams = new URLSearchParams(curLink.href);
            revision = {
                oldrevision: urlParams.get('oldid'),
                element: curLink.parentNode.parentNode
            };
        }
        if (revision) {
            await analyzeRevision(revision);
        }
    }
    // If no diff/history links were found, treat the page as having a single
    // (first) revision and analyze that instead.
    if (noLinks && entriesContainers.length > 0) {
        const pageTitle = mw.config.get('wgTitle');
        const firstID = await fetchFirstRevisionId(pageTitle);
        // Bail out if the first revision could not be resolved, instead of
        // passing a null revision id to analyzeRevision.
        if (!firstID) {
            return;
        }
        // BUG FIX: `revision` was previously assigned without declaration,
        // creating an implicit global (ReferenceError in strict mode).
        const revision = {
            oldrevision: firstID,
            element: entriesContainers[0]
        };
        await analyzeRevision(revision);
    }
}

/**
 * Analyzes a revision (or a pair of revisions) for newly added URLs,
 * compares them against ___domain watchlists, and highlights matches.
 *
 * @param {Object} revision - Object containing oldrevision, optional newrevision, and DOM element.
 */
/**
 * Analyzes a revision (or a pair of revisions) for newly added URLs,
 * compares them against ___domain watchlists, and highlights matches.
 *
 * Diff results are cached in localStorage for 30 days.
 *
 * @param {Object} revision - Object containing oldrevision, optional newrevision, and DOM element.
 */
async function analyzeRevision(revision) {
    const lookup = [revision.oldrevision];
    if (revision.newrevision) {
        lookup.push(revision.newrevision);
    }
    const wikiDomain = ___location.hostname;
    const cacheKey = `revisionDiff:${wikiDomain}:${revision.oldrevision}:${revision.newrevision || 'null'}`;
    const oneMonth = 30 * 24 * 60 * 60 * 1000;
    let addedURLs = [];
    let removedURLs = [];
    // BUG FIX: track cache hits explicitly. Previously a valid cached result
    // with no URLs was indistinguishable from a cache miss, so such revisions
    // were refetched (and re-cached) on every page view.
    let cacheHit = false;

    // Try reading from cache
    const cached = localStorage.getItem(cacheKey);
    if (cached) {
        try {
            const parsed = JSON.parse(cached);
            const age = Date.now() - parsed.timestamp;
            if (age < oneMonth) {
                console.log(`Cache hit for revision ${cacheKey}`);
                cacheHit = true;
                if (Array.isArray(parsed.addedURLs)) {
                    addedURLs = parsed.addedURLs;
                }
                if (Array.isArray(parsed.removedURLs)) {
                    removedURLs = parsed.removedURLs;
                }
            }
        } catch (e) {
            console.warn('Cache parse error, refetching:', e);
        }
    }

    // If not cached, fetch and process
    if (!cacheHit) {
        const wikitext = await fetchRevisionContent(lookup);
        const fromURLs = new Set(extractURLs(wikitext.oldrevision) || []);
        const toURLs = new Set(extractURLs(wikitext.newrevision) || []);

        if (revision.newrevision) {
            // URLs present in new revision but not in old revision = added
            addedURLs = [...toURLs].filter(url => !fromURLs.has(url));
            // URLs present in old revision but not in new revision = removed
            removedURLs = [...fromURLs].filter(url => !toURLs.has(url));
        } else {
            // For first revision, all URLs are considered added
            addedURLs = Array.from(fromURLs);
            removedURLs = [];
        }
        try {
            localStorage.setItem(cacheKey, JSON.stringify({
                timestamp: Date.now(),
                addedURLs,
                removedURLs
            }));
        } catch (e) {
            console.warn('Failed to store cache:', e);
        }
    }
    console.log(`Revision element: ${revision.element.innerHTML}
  Added URLs: ${addedURLs.join(' ')}
  Removed URLs: ${removedURLs.join(' ')}
  `);

    // Match domains to indicator types
    const matchedDomains = Object.keys(indicators).reduce((acc, key) => {
        acc[key] = [];
        return acc;
    }, {});

    // Process removed URLs first - these always get the "removed" indicator
    if (removedURLs.length > 0) {
        const removedDomains = [];
        for (const url of removedURLs) {
            try {
                const hostname = new URL(url).hostname;
                const ___domain = getRootDomain(hostname, publicSuffixSet);
                if (!removedDomains.includes(___domain)) {
                    removedDomains.push(___domain);
                }
            } catch (e) {
                console.warn(`Error processing removed URL ${url}:`, e);
            }
        }
        matchedDomains.removed = removedDomains;
    }

    // Process added URLs: each ___domain is attributed only to the
    // highest-priority indicator list that contains it.
    for (const url of addedURLs) {
        try {
            const hostname = new URL(url).hostname;
            const ___domain = getRootDomain(hostname, publicSuffixSet);
            let highestPriorityType = null;
            for (const type in indicators) {
                if (type !== 'removed' && indicators[type].list.has(___domain)) {
                    if (
                        highestPriorityType === null ||
                        indicators[type].priority > indicators[highestPriorityType].priority
                    ) {
                        highestPriorityType = type;
                    }
                }
            }
            if (
                highestPriorityType !== null &&
                !matchedDomains[highestPriorityType].includes(___domain)
            ) {
                matchedDomains[highestPriorityType].push(___domain);
                // Remove this ___domain from lower priority lists
                for (const type in indicators) {
                    if (
                        type !== 'removed' && // Never remove from "removed" list
                        indicators[type].priority < indicators[highestPriorityType].priority
                    ) {
                        matchedDomains[type] = matchedDomains[type].filter(d => d !== ___domain);
                    }
                }
            }
        } catch (e) {
            console.warn(`Error processing added URL ${url}:`, e);
        }
    }

    // Prepend emoji indicators - "removed" indicator should appear even if other indicators are present
    for (const type in indicators) {
        if (matchedDomains[type] && matchedDomains[type].length > 0) {
            prependEmojiWithTooltip(revision.element, type, matchedDomains[type]);
        }
    }
}

/**
 * Prepends an emoji and tooltip to a revision list entry DOM element if any
 * domains matched a warning list.
 *
 * @param {HTMLElement} element - The container element to prepend the emoji to.
 * @param {string} type - The type of indicator ('warning', 'caution', 'inspect').
 * @param {string[]} domains - The list of matched domains for the indicator.
 */
/**
 * Prepends an emoji and tooltip to a revision list entry DOM element if any
 * domains matched a warning list. A data attribute prevents the same
 * indicator type from being added twice to the same element.
 *
 * @param {HTMLElement} element - The container element to prepend the emoji to.
 * @param {string} type - The type of indicator ('warning', 'caution', 'inspect').
 * @param {string[]} domains - The list of matched domains for the indicator.
 */
function prependEmojiWithTooltip(element, type, domains) {
    const indicator = indicators[type];
    if (!indicator) {
        return;
    }
    const processedFlag = `data-processed-${type}`;
    if (element.getAttribute(processedFlag) === 'true') {
        return;
    }
    const marker = document.createElement('span');
    marker.textContent = indicator.emoji + " ";
    marker.title = `${indicator.msg}: ${domains.join(", ")}`;
    element.parentNode.insertBefore(marker, element);
    element.setAttribute(processedFlag, 'true');
}

/**
 * Extracts the first page object from MediaWiki API query response.
 *
 * @param {Object} data - MediaWiki API response.
 * @returns {Object|null} The first page object or null if unavailable.
 */
/**
 * Extracts the first page object from MediaWiki API query response.
 *
 * @param {Object} data - MediaWiki API response.
 * @returns {Object|null} The first page object or null if unavailable.
 */
async function getFirstPage(data) {
    const pages = data?.query?.pages;
    if (!pages) return null;
    const [firstPage] = Object.values(pages);
    return firstPage;
}

/**
 * Retrieves the first revision from a page object.
 *
 * @param {Object} page - Page object containing revisions.
 * @returns {Object|null} First revision object or null.
 */
/**
 * Retrieves the first revision from a page object.
 *
 * @param {Object} page - Page object containing revisions.
 * @returns {Object|null} First revision object or null.
 */
async function getFirstRevision(page) {
    const revisions = page.revisions;
    if (!revisions || revisions.length === 0) {
        return null;
    }
    return revisions[0];
}

/**
 * Fetches wikitext content for one or two revisions by ID.
 *
 * @param {string[]} revIds - Array of revision IDs.
 * @returns {Object} Object with `oldrevision` and optionally `newrevision` as wikitext strings.
 */
/**
 * Fetches wikitext content for one or two revisions by ID.
 *
 * @param {string[]} revIds - Array of revision IDs.
 * @returns {Object} Object with `oldrevision` and optionally `newrevision` as wikitext strings (null when unavailable).
 */
async function fetchRevisionContent(revIds) {
    const data = await fetchRevisionData({
        revids: revIds,
        rvprop: ['content'],
        rvslots: ['main']
    });
    const page = await getFirstPage(data);
    const wikitext = { oldrevision: null, newrevision: null };
    // BUG FIX: guard against a null page (e.g. when the API request fails and
    // safeFetch returns null); previously `page.revisions` threw a TypeError.
    if (page && page.revisions && page.revisions.length > 0) {
        wikitext.oldrevision = page.revisions[0].slots.main['*'] || null;
        if (page.revisions.length > 1) {
            wikitext.newrevision = page.revisions[1].slots.main['*'] || null;
        }
    }
    return wikitext;
}

/**
 * Fetches the parent revision IDs for a given list of revision IDs.
 *
 * @param {string[]} revisionIds - Array of revision IDs.
 * @returns {Object} Map of revision ID to its parent ID.
 */
/**
 * Fetches the parent revision IDs for a given list of revision IDs.
 *
 * @param {string[]} revisionIds - Array of revision IDs.
 * @returns {Object} Map of revision ID to its parent ID (empty on failure).
 */
async function fetchPreviousRevisionIds(revisionIds) {
    const data = await fetchRevisionData({
        revids: revisionIds,
        rvprop: ['ids']
    });
    const page = await getFirstPage(data);
    // BUG FIX: also guard against a page object without a revisions array
    // (e.g. a deleted or invalid revid); previously the for-of threw.
    if (!page || !Array.isArray(page.revisions)) return {};
    const revisionMap = {};
    for (const revision of page.revisions) {
        revisionMap[revision.revid] = revision.parentid;
    }
    return revisionMap;
}

/**
 * Fetches the ID of the first revision of a page.
 *
 * @param {string} pageTitle - The page title to look up.
 * @returns {number|null} Revision ID or null.
 */
/**
 * Fetches the ID of the first (oldest) revision of a page.
 *
 * @param {string} pageTitle - The page title to look up.
 * @returns {number|null} Revision ID or null.
 */
async function fetchFirstRevisionId(pageTitle) {
    const data = await fetchRevisionData({
        titles: [pageTitle],
        rvlimit: 1,
        rvdir: 'newer',
        rvprop: ['ids'],
    });
    const page = await getFirstPage(data);
    if (!page) {
        return null;
    }
    const revision = await getFirstRevision(page);
    if (!revision) {
        return null;
    }
    return revision.revid;
}

/**
 * Fetches the list of subpages from the list of lists, parses wikilinks, caches
 * the result, and returns list of subpage titles.
 *
 * @param {string} pageName - Title of the list-of-lists page.
 * @returns {Promise<string[]>} List of subpage titles.
 */
/**
 * Fetches the list of subpages from the list of lists, parses wikilinks, caches
 * the result (4-hour TTL), and returns the list of subpage titles.
 *
 * @param {string} pageName - Title of the list-of-lists page.
 * @returns {Promise<string[]>} List of subpage titles (empty on failure).
 */
async function fetchDomainListPages(pageName) {
    const cacheKey = `citationWatchlistFetchDomainListPages_${pageName}`;
    const cacheExpiration = 4 * 60 * 60 * 1000;
    const now = Date.now();
    const cachedData = localStorage.getItem(cacheKey);
    const cachedTimestamp = localStorage.getItem(`${cacheKey}_timestamp`);
    if (cachedData && cachedTimestamp && (now - parseInt(cachedTimestamp, 10)) <
        cacheExpiration) {
        console.log("Loaded list of lists from cache");
        return JSON.parse(cachedData);
    }
    const data = await fetchRevisionData({
        titles: [pageName],
        rvprop: ['content'],
        rvslots: ['*']
    });
    const page = await getFirstPage(data);
    if (!page) return [];
    // BUG FIX: a nonexistent page comes back as a stub without `revisions`;
    // previously `page.revisions[0]` threw a TypeError.
    if (!page.revisions || !page.revisions[0]) return [];
    const content = page.revisions[0].slots.main['*'];
    const pageTitles = [];
    const lines = content.split('\n');
    for (let line of lines) {
        // Only bullet entries of the form "* [[Page Title]]" are recognized.
        if (line.startsWith('* [[')) {
            const match = line.match(
                /\[\[([^\]]+)\]\]/); // Matches the first instance of [[Page Title]]
            if (match) {
                pageTitles.push(match[1]);
            }
        }
    }
    localStorage.setItem(cacheKey, JSON.stringify(pageTitles));
    localStorage.setItem(`${cacheKey}_timestamp`, now.toString());
    console.log("Loaded from API and stored in cache");
    return pageTitles;
}

/**
 * Loads ___domain lists from a set of pages, categorizes them by indicator section
 * headers, and populates the corresponding `Set` in the global `indicators` object.
 *
 * @param {string[]} pageNames - List of page titles to fetch.
 * @returns {Object} Updated indicators object with ___domain sets.
 */
/**
 * Loads ___domain lists from a set of pages (with a 6-hour localStorage cache),
 * categorizes them by indicator section headers, and populates the
 * corresponding `Set` in the global `indicators` object.
 *
 * @param {string[]} pageNames - List of page titles to fetch.
 * @returns {Object} Updated indicators object with ___domain sets.
 */
async function fetchAndOrganizeDomainLists(pageNames) {
    const cacheTTL = 6 * 60 * 60 * 1000;
    const now = Date.now();
    const cachedData = {};
    const pagesToFetch = [];
    for (const title of pageNames) {
        const cacheKey = `domainList:${___location.hostname}:${title}`;
        const cached = localStorage.getItem(cacheKey);
        if (cached) {
            try {
                const parsed = JSON.parse(cached);
                if (now - parsed.timestamp < cacheTTL && parsed.content) {
                    console.log(`Using cached content for page: ${title}`);
                    cachedData[title] = parsed.content;
                    continue;
                } else {
                    console.log(`Cache expired for page: ${title}`);
                }
            } catch (e) {
                console.warn(`Cache error for ${title}:`, e);
            }
        }
        console.log(`Will fetch page: ${title}`);
        pagesToFetch.push(title);
    }
    let fetchedPages = {};
    if (pagesToFetch.length > 0) {
        const apiData = await fetchRevisionData({
            titles: pagesToFetch,
            rvprop: ['content'],
            rvslots: ['*'],
        });
        const pages = apiData.query.pages;
        for (const pageId in pages) {
            const page = pages[pageId];
            const title = page.title;
            // BUG FIX: a nonexistent list page is returned as a stub without
            // `revisions`; previously `page.revisions[0]` threw and aborted
            // processing of the whole batch.
            if (!page.revisions || !page.revisions[0]) {
                console.warn(`No content available for page: ${title}`);
                continue;
            }
            const content = page.revisions[0].slots.main['*'];
            fetchedPages[title] = content;
            const cacheKey = `domainList:${___location.hostname}:${title}`;
            try {
                localStorage.setItem(cacheKey, JSON.stringify({
                    timestamp: now,
                    content,
                }));
                console.log(`Cached content for page: ${title}`);
            } catch (e) {
                console.warn(`Failed to cache ${title}:`, e);
            }
        }
    }
    const allContent = { ...cachedData, ...fetchedPages };
    for (const title in allContent) {
        const content = allContent[title];
        // The most recent section header seen determines which indicator set
        // subsequent "* ___domain" bullets are added to.
        let currentList = null;
        const lines = content.split('\n');
        for (let line of lines) {
            for (const type in indicators) {
                if (line.trim() === indicators[type].section) {
                    currentList = indicators[type].list;
                    break;
                }
            }
            if (line.startsWith('*') && currentList) {
                const ___domain = line.substring(1).trim();
                // Discard ___domain entries with slashes (indicating domains with paths)
                if (!___domain.includes('/')) {
                    currentList.add(___domain);
                }
            }
        }
    }
    return indicators;
}

/**
 * Fetches and caches the public suffix list used to identify top-level domains.
 *
 * @returns {Promise<Set<string>>} Set of public suffixes.
 */
/**
 * Fetches and caches the public suffix list used to identify top-level domains.
 *
 * The raw page text is cached in localStorage for 24 hours. Empty lines and
 * "//" comment lines are discarded.
 *
 * @returns {Promise<Set<string>>} Set of public suffixes (empty on failure).
 */
async function fetchPublicSuffixList() {
    const cacheKey = 'publicSuffixListCache';
    const cacheTTL = 24 * 60 * 60 * 1000;
    // Single parser shared by the cached and freshly-fetched paths; previously
    // the same filtering logic was duplicated in two slightly different forms.
    const parseSuffixes = (text) => new Set(
        text.split('\n')
            .map(line => line.trim())
            .filter(line => line && !line.startsWith('//'))
    );
    const cached = localStorage.getItem(cacheKey);
    if (cached) {
        try {
            const parsed = JSON.parse(cached);
            const age = Date.now() - parsed.timestamp;
            if (age < cacheTTL && parsed.content) {
                console.log('Using cached public suffix list');
                return parseSuffixes(parsed.content);
            }
        } catch (e) {
            console.warn('Error parsing cache, refetching:', e);
        }
    }
    const pslUrl = mw.config.get('wgArticlePath').replace('$1', publicSuffixList)
        + '?action=raw';
    console.log(`Raw page text request: ${pslUrl}`);
    const content = await safeFetch(fetch, pslUrl).then(response => response ?
        response.text() : null);
    if (!content) return new Set();
    try {
        localStorage.setItem(cacheKey, JSON.stringify({
            timestamp: Date.now(),
            content
        }));
    } catch (e) {
        console.warn('Failed to write to cache:', e);
    }
    return parseSuffixes(content);
}

/**
 * Makes a MediaWiki API call to fetch revision metadata or content.
 *
 * @param {Object} data - Options for the API call, such as `revids`, `titles`, `rvprop`, etc.
 * @returns {Promise<Object>} MediaWiki API query result.
 */
/**
 * Makes a MediaWiki API call to fetch revision metadata or content.
 *
 * Array-valued parameters are joined with '|' per the MediaWiki API
 * multi-value convention.
 *
 * @param {Object} data - Options for the API call, such as `revids`, `titles`, `rvprop`, etc.
 * @returns {Promise<Object>} MediaWiki API query result.
 */
async function fetchRevisionData(data) {
    const params = {
        action: 'query',
        prop: 'revisions',
        format: 'json',
        rvdir: data.rvdir || 'older',
        origin: '*'
    };
    if (data.rvlimit) {
        params.rvlimit = data.rvlimit;
    }
    for (const key of ['rvprop', 'revids', 'titles', 'rvslots']) {
        const value = data[key];
        if (value) {
            params[key] = Array.isArray(value) ? value.join('|') : value;
        }
    }
    const api = new mw.Api();
    return await safeFetch(api.get.bind(api), params);
}

/**
 * Wraps any asynchronous fetch function with retry logic and error handling.
 *
 * @param {Function} fn - The function to execute (usually an API call).
 * @param {...any} args - Arguments to pass to the fetch function.
 * @param {Object} options - Optional configuration for the fetch operation.
 * @param {number} options.retries - Number of retry attempts (default: 2).
 * @param {number} options.retryDelay - Delay between retries in ms (default: 1000).
 * @returns {Promise<any|null>} Result of the fetch or null on failure.
 */
/**
 * Wraps any asynchronous fetch function with retry logic and error handling.
 *
 * An optional trailing argument created via `safeFetch.withOptions()` is
 * consumed by the wrapper and not forwarded to `fn`.
 *
 * @param {Function} fn - The function to execute (usually an API call).
 * @param {...any} args - Arguments to pass to the fetch function.
 * @returns {Promise<any|null>} Result of the fetch or null after all attempts fail.
 */
async function safeFetch(fn, ...args) {
    // Pop the options object off the argument list if one was supplied.
    let config = { retries: 2, retryDelay: 1000 };
    const tail = args[args.length - 1];
    if (args.length > 0 && typeof tail === 'object' && tail._isSafeFetchOptions) {
        config = { ...config, ...args.pop() };
    }

    const maxAttempts = config.retries + 1;
    const label = fn.name || 'fetch operation';

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            const result = await fn(...args);

            // Treat missing results as failures so they trigger a retry.
            if (result === null || result === undefined) {
                throw new Error('Received null or undefined response');
            }

            // Fetch API responses signal HTTP failure via `ok`.
            if (typeof result.ok === 'boolean' && !result.ok) {
                throw new Error(`HTTP error ${result.status}: ${result.statusText || 'Unknown error'}`);
            }

            return result;
        } catch (error) {
            if (attempt < maxAttempts) {
                console.warn(`Error during ${label} (attempt ${attempt}/${maxAttempts}):`,
                    error.message || error);
                // Pause before the next attempt.
                await new Promise(resolve => setTimeout(resolve, config.retryDelay));
            } else {
                console.error(`All ${maxAttempts} attempts failed for ${label}:`,
                    error.message || error);
            }
        }
    }

    // Every attempt failed.
    return null;
}

/**
 * Builds a trailing options argument for safeFetch.
 *
 * @param {number} [retries] - Number of retry attempts (default: 2).
 * @param {number} [retryDelay] - Delay between retries in ms (default: 1000).
 * @returns {Object} Marked options object recognized by safeFetch.
 */
safeFetch.withOptions = function(retries, retryDelay) {
    return {
        retries: retries || 2,
        retryDelay: retryDelay || 1000,
        _isSafeFetchOptions: true
    };
};

/**
 * Extracts all HTTP(S) URLs from a given wikitext string.
 *
 * @param {string} wikitext - Raw wikitext revision content.
 * @returns {string[]} List of valid extracted URLs.
 */
/**
 * Extracts all HTTP(S) URLs from a given wikitext string.
 *
 * URLs are validated and normalized via the URL constructor; duplicates
 * (after normalization) are returned only once, in order of first appearance.
 *
 * @param {string} wikitext - Raw wikitext revision content.
 * @returns {string[]} List of valid extracted URLs.
 */
function extractURLs(wikitext) {
    if (!wikitext) return [];

    const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)/g;
    const seen = new Set();
    const urls = [];
    for (const [raw] of wikitext.matchAll(urlPattern)) {
        try {
            // The URL constructor both validates and normalizes the match.
            const normalized = new URL(raw).href;
            if (!seen.has(normalized)) {
                seen.add(normalized);
                urls.push(normalized);
            }
        } catch (error) {
            console.error(`Invalid URL rejected: ${raw}`, error.message);
        }
    }
    return urls;
}

/**
 * Extracts the top-level ___domain from a full hostname using a public suffix set.
 *
 * @param {string} hostname - Full hostname (e.g., sub.example.co.uk).
 * @param {Set<string>} publicSuffixSet - Set of known public suffixes.
 * @returns {string} The top-level ___domain (e.g., example.co.uk).
 */
/**
 * Extracts the top-level ___domain from a full hostname using a public suffix set.
 *
 * IPv4 literals and single-label hostnames are returned unchanged. If no
 * suffix from the set matches, the last two labels are returned as a fallback.
 *
 * NOTE(review): entries prefixed with "!" (PSL exception rules) are treated
 * here the same as ordinary suffixes, and wildcard ("*.") rules are not
 * handled — presumably acceptable for watchlist purposes; confirm against the
 * Public Suffix List format if stricter parsing is ever needed.
 *
 * @param {string} hostname - Full hostname (e.g. sub.example.co.uk).
 * @param {Set<string>} publicSuffixSet - Set of known public suffixes.
 * @returns {string} The top-level ___domain (e.g. example.co.uk).
 */
function getRootDomain(hostname, publicSuffixSet) {
    if (!hostname || typeof hostname !== 'string') {
        console.warn('Invalid hostname provided to getRootDomain:', hostname);
        return '';
    }

    // IPv4 literals have no ___domain hierarchy; return them unchanged.
    if (/^(\d{1,3}\.){3}\d{1,3}$/.test(hostname)) {
        return hostname;
    }

    const parts = hostname.split('.');

    // A single-label hostname (e.g. "localhost") is returned as-is.
    if (parts.length < 2) {
        return hostname;
    }

    // Scan from the longest candidate suffix down to the shortest.
    for (let i = 0; i < parts.length; i++) {
        const candidate = parts.slice(i).join('.');
        const listed = publicSuffixSet.has(candidate) ||
            publicSuffixSet.has(`!${candidate}`);
        if (!listed) continue;
        // Include one label before the suffix; if the entire hostname is
        // itself a suffix, return it unchanged.
        return i > 0 ? parts.slice(i - 1).join('.') : hostname;
    }

    // No suffix matched: two-label hostnames are already root domains;
    // otherwise fall back to the last two labels.
    return parts.length === 2 ? hostname : parts.slice(-2).join('.');
}

/**
 * Determines whether a given page title does *not* belong to the main or draft namespaces.
 *
 * @param {string} pageTitle - The title of the page.
 * @returns {boolean} True if not an article namespace.
 */
/**
 * Determines whether a given page title does *not* belong to the main or
 * draft namespaces, by checking it against the module-level `namespaces`
 * prefix list.
 *
 * @param {string} pageTitle - The title of the page.
 * @returns {boolean} True if not an article namespace.
 */
function isNotArticle(pageTitle) {
    for (const prefix of namespaces) {
        if (pageTitle.startsWith(prefix)) {
            return true;
        }
    }
    return false;
}

/**
 * Cleans up expired localStorage cache entries based on known cache key prefixes and TTLs.
 */
/**
 * Cleans up expired localStorage cache entries based on known cache key
 * prefixes and TTLs. Handles both paired key/"_timestamp" entries and JSON
 * entries with an embedded `timestamp` field.
 */
function purgeExpiredCache() {
    const now = Date.now();
    // Define cache configurations with their TTLs in milliseconds
    const knownCaches = [
        { prefix: 'revisionDiff:', ttl: 30 * 24 * 60 * 60 * 1000, description: 'Revision diff cache' },
        { prefix: 'domainList:', ttl: 6 * 60 * 60 * 1000, description: 'Domain list cache' },
        { prefix: 'publicSuffixListCache', ttl: 24 * 60 * 60 * 1000, description: 'Public suffix list cache' },
        { prefix: 'citationWatchlistFetchDomainListPages_', ttl: 4 * 60 * 60 * 1000, description: 'Domain list pages cache' }
    ];

    // Track statistics for logging
    const stats = { checked: 0, expired: 0, errors: 0 };

    try {
        // BUG FIX: snapshot all keys before removing anything. Removing items
        // while iterating by index shifts localStorage's key indices, which
        // previously caused some entries to be skipped.
        const keys = [];
        for (let i = 0; i < localStorage.length; i++) {
            const key = localStorage.key(i);
            if (key) keys.push(key);
        }

        for (const key of keys) {
            // Check if this key belongs to one of our known caches
            for (const cache of knownCaches) {
                if (key.startsWith(cache.prefix)) {
                    stats.checked++;
                    try {
                        if (key.endsWith('_timestamp')) {
                            // Handle paired key-timestamp entries
                            const baseKey = key.replace(/_timestamp$/, '');
                            const timestampStr = localStorage.getItem(key);

                            if (!timestampStr) {
                                // Orphaned timestamp key without a value
                                localStorage.removeItem(key);
                                console.log(`Removed orphaned timestamp key: ${key}`);
                                stats.expired++;
                                continue;
                            }

                            const timestamp = parseInt(timestampStr, 10);
                            if (isNaN(timestamp) || now - timestamp > cache.ttl) {
                                // Expired or invalid timestamp
                                localStorage.removeItem(key);

                                // Also remove the base key if it exists
                                if (localStorage.getItem(baseKey) !== null) {
                                    localStorage.removeItem(baseKey);
                                    console.log(`Purged expired ${cache.description}: ${baseKey}`);
                                } else {
                                    console.log(`Removed orphaned timestamp for missing key: ${baseKey}`);
                                }
                                stats.expired++;
                            }
                        } else {
                            // Handle JSON entries with embedded timestamps
                            const value = localStorage.getItem(key);
                            if (!value) {
                                // Empty value, just remove it
                                localStorage.removeItem(key);
                                console.log(`Removed empty cache entry: ${key}`);
                                stats.expired++;
                                continue;
                            }

                            try {
                                const parsed = JSON.parse(value);
                                if (parsed && parsed.timestamp && now - parsed.timestamp > cache.ttl) {
                                    // Expired based on embedded timestamp
                                    localStorage.removeItem(key);
                                    console.log(`Purged expired ${cache.description}: ${key}`);
                                    stats.expired++;
                                }
                            } catch (jsonError) {
                                // Invalid JSON, remove the entry
                                localStorage.removeItem(key);
                                console.warn(`Removed invalid JSON cache entry: ${key}`, jsonError.message);
                                stats.errors++;
                                stats.expired++;
                            }
                        }
                    } catch (itemError) {
                        console.warn(`Error processing cache item ${key}:`, itemError.message);
                        stats.errors++;
                        // Try to remove problematic entries
                        try {
                            localStorage.removeItem(key);
                            console.log(`Removed problematic cache entry: ${key}`);
                            stats.expired++;
                        } catch (removeError) {
                            console.error(`Failed to remove problematic entry ${key}:`, removeError.message);
                        }
                    }
                    break; // Once we've matched a cache type, no need to check others
                }
            }
        }

        // Log summary statistics
        if (stats.checked > 0) {
            console.log(`Cache cleanup complete: checked ${stats.checked} items, removed ${stats.expired} expired items, encountered ${stats.errors} errors`);
        }
    } catch (globalError) {
        console.error('Fatal error during cache cleanup:', globalError.message);
    }
}

// Kick off the script. BUG FIX: attach a .catch so any failure in
// analyzeView is logged instead of surfacing as an unhandled rejection.
analyzeView()
    .then(() => console.log('Citation Watchlist script finished executing'))
    .catch(err => console.error('Citation Watchlist script failed:', err));