// User:Harej/citation-watchlist.js
//
// Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
/**
 *
 * Citation Watchlist
 * https://en.wikipedia.org/wiki/WP:WATCHCITE
 *
 */
/**
 * ==========================================================================
 * Domain List Configuration
 * ==========================================================================
 *
 * Citation Watchlist requires the following wiki pages to function:
 *
 * 1. Public Suffix List
 *    - A local copy of the public suffix list, used for ___domain parsing.
 *    - Copy the contents of:
 *        https://en.wikipedia.org/wiki/Wikipedia:Citation_Watchlist/Public_Suffix_List
 *      to a page on your own wiki.
 *    - Update the `publicSuffixList` variable below to reflect your page title.
 *
 * 2. List of Lists
 *    - A page linking to one or more ___domain list pages.
 *    - Format as a bullet list: "* [[Page Title]]" (space after asterisk).
 *    - Reference formatting example:
 *        https://en.wikipedia.org/wiki/Wikipedia:Citation_Watchlist/Lists
 *    - Update the `listOfLists` variable below accordingly.
 *
 * 3. Domain List Pages
 *    - One or more pages listing suspicious or noteworthy domains.
 *    - Each page must contain section headers that match the `indicators` config
 *      below (e.g., "==Warn==", "==Caution==").
 *    - Under each section, list domains in the format: "* example.com"
 *    - Do not use link formatting—just plain text.
 */


// Title of the wiki page holding the local copy of the public suffix list
// (see "Domain List Configuration" notes above).
const publicSuffixList = "Wikipedia:Citation_Watchlist/Public_Suffix_List";
// Title of the "list of lists" page that links to the individual ___domain list pages.
const listOfLists = "Wikipedia:Citation_Watchlist/Lists";


/**
 * ==========================================================================
 * Indicator Configuration
 * ==========================================================================
 *
 * Defines metadata for ___domain indicators used in the watchlist UI.
 * Each indicator is associated with a level of urgency and a unique symbol.
 *
 * Fields:
 * - msg:     Display label for the level (e.g., "Warning", "Caution").
 * - emoji:   Unicode character for the visual indicator (escaped as `\uXXXX`).
 * - section: Must exactly match the section headers in the ___domain list pages.
 * - priority: Higher values override lower ones for conflicting ___domain matches.
 *             Priority scale: 1 (lowest) to N (highest).
 * - list:    Defined as "new Set()" for all indicator types.
 *
 * If a ___domain appears in multiple lists, the one with the highest priority
 * takes precedence.
 */


// Indicator definitions keyed by type. Each `list` set starts empty and is
// populated at runtime from the ___domain list pages (see
// fetchAndOrganizeDomainLists); `section` must match a header on those pages.
const indicators = {
    // Highest-priority indicator, fed from the "==Warn==" section.
    warning: {
        msg: "Warning",
        emoji: '\u2757', // ❗
        section: "==Warn==",
        priority: 3,
        list: new Set()
    },
    // Mid-priority indicator, fed from the "==Caution==" section.
    caution: {
        msg: "Caution",
        emoji: '\u270B', // ✋
        section: "==Caution==",
        priority: 2,
        list: new Set()
    },
    // Lowest list-driven indicator, fed from the "==Inspect==" section.
    inspect: {
        msg: "Inspect",
        emoji: '\uD83D\uDD0E', // 🔎
        section: "==Inspect==",
        priority: 1,
        list: new Set()
    },
    // Not sourced from any list page (section: null); applied by
    // analyzeRevision to domains whose URLs were removed in a diff.
    removed: {
        msg: "Removed",
        emoji: '\u{1F5D1}', // 🗑
        section: null,
        priority: -1,
        list: new Set()
    }
};


/**
 * Citation Watchlist
 *
 * Highlights potentially questionable citations added in Wikipedia revisions,
 * using predefined ___domain lists and a public suffix list to analyze diffs.
 *
 *
 * Documentation: https://en.wikipedia.org/wiki/WP:WATCHCITE
 *
 * Author: James Hare under contract with Hacks/Hackers
 * License: GNU General Public License v3.0 (GPL-3.0)
 *
 * @version 1.14
 * @since 2025-06-27
 */


/**
 * If you want to modify any part of the script below this point, please submit
 * your edits to https://test.wikipedia.org/wiki/User:Harej/citation-watchlist.js
 * so that your modifications can be tested.
 */

// Set of public suffixes (e.g. "co.uk"); populated by analyzeView() from
// fetchPublicSuffixList() and consumed by getRootDomain().
let publicSuffixSet = new Set();
// "Namespace:" prefixes for every namespace except main ('0') and draft ('118'),
// with spaces replaced by underscores; used by isNotArticle() to skip
// non-article page titles.
const namespaces = Object.entries(mw.config.get('wgFormattedNamespaces'))
    .filter(([num, name]) => num !== '0' && num !== '118')
    .map(([_, name]) => name.replace(/ /g, '_') + ':');

/**
 * Main entry point for Citation Watchlist.
 * Determines if the current page should be analyzed, fetches ___domain and suffix
 * lists, processes each change/revision in the recent changes or history page,
 * and triggers analysis to highlight questionable domains.
 */
/**
 * Main entry point for Citation Watchlist.
 * Determines if the current page should be analyzed, fetches ___domain and suffix
 * lists, processes each change/revision in the recent changes or history page,
 * and triggers analysis to highlight questionable domains.
 */
async function analyzeView() {
    purgeExpiredCache();
    const ns = mw.config.get('wgNamespaceNumber');
    // Only run on special pages (-1), articles (0), and drafts (118).
    if (![-1, 0, 118].includes(ns)) {
        return;
    }
    publicSuffixSet = await fetchPublicSuffixList();
    if (publicSuffixSet.size === 0) {
        console.error('Public Suffix List loading failed');
        return;
    }
    console.log("Welcome to Citation Watchlist");
    const listPages = await fetchDomainListPages(listOfLists);
    if (listPages) {
        const lists = await fetchAndOrganizeDomainLists(listPages);
        if (lists) {
            // Merge fetched ___domain sets into the global indicator sets.
            for (const type in indicators) {
                lists[type].list.forEach(indicators[type].list.add, indicators[type].list);
            }
        }
    }
    const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
    let noLinks = true;
    for (const container of entriesContainers) {
        const diffLink = container.querySelector('a.mw-changeslist-diff');
        const histLink = container.querySelector('a.mw-changeslist-history');
        const prevLink = container.querySelector(
            'a.mw-history-histlinks-previous');
        const curLink = container.querySelector('a.mw-history-histlinks-current');
        let revision = null;
        let urlParams = '';
        if (diffLink) {
            noLinks = false;
            const diffUrl = new URL(diffLink.href);
            urlParams = new URLSearchParams(diffUrl.search);
            const pageTitle = urlParams.get('title');
            if (isNotArticle(pageTitle)) continue;
            revision = {
                oldrevision: urlParams.get('diff'),
                newrevision: urlParams.get('oldid'),
                element: diffLink.parentNode.parentNode
            };
            if (revision.oldrevision === 'prev') { // This happens on user contributions pages
                const previousRevisionMap = await fetchPreviousRevisionIds(
                    [revision.newrevision]);
                revision.oldrevision = revision.newrevision;
                revision.newrevision = previousRevisionMap[revision.newrevision];
            }
        } else if (histLink) {
            noLinks = false;
            const histUrl = new URL(histLink.href);
            urlParams = new URLSearchParams(histUrl.search);
            const pageTitle = urlParams.get('title');
            if (isNotArticle(pageTitle)) continue;
            const firstID = await fetchFirstRevisionId(pageTitle);
            if (!firstID) continue;
            revision = {
                oldrevision: firstID,
                element: histLink.parentNode.parentNode
            };
        } else if (prevLink) {
            noLinks = false;
            urlParams = new URLSearchParams(prevLink.href);
            const previousRevisionMap = await fetchPreviousRevisionIds(
                [urlParams.get('oldid')]);
            revision = {
                oldrevision: urlParams.get('oldid'),
                newrevision: previousRevisionMap[urlParams.get('oldid')],
                element: prevLink.parentNode.parentNode
            };
        } else if (curLink) {
            noLinks = false;
            urlParams = new URLSearchParams(curLink.href);
            revision = {
                oldrevision: urlParams.get('oldid'),
                element: curLink.parentNode.parentNode
            };
        }
        if (revision) {
            await analyzeRevision(revision);
        }
    }
    // If no diff/history links were found, treat the page as having a single
    // (first) revision and analyze that instead.
    if (noLinks && entriesContainers.length > 0) {
        const pageTitle = mw.config.get('wgTitle');
        const firstID = await fetchFirstRevisionId(pageTitle);
        // Bail out if the first revision could not be resolved, instead of
        // passing a null revision id to analyzeRevision.
        if (!firstID) {
            return;
        }
        // BUG FIX: `revision` was previously assigned without declaration,
        // creating an implicit global (ReferenceError in strict mode).
        const revision = {
            oldrevision: firstID,
            element: entriesContainers[0]
        };
        await analyzeRevision(revision);
    }
}

/**
 * Analyzes a revision (or a pair of revisions) for newly added URLs,
 * compares them against ___domain watchlists, and highlights matches.
 *
 * @param {Object} revision - Object containing oldrevision, optional newrevision, and DOM element.
 */
/**
 * Analyzes a revision (or a pair of revisions) for newly added URLs,
 * compares them against ___domain watchlists, and highlights matches.
 *
 * Diff results are cached in localStorage for 30 days.
 *
 * @param {Object} revision - Object containing oldrevision, optional newrevision, and DOM element.
 */
async function analyzeRevision(revision) {
    const lookup = [revision.oldrevision];
    if (revision.newrevision) {
        lookup.push(revision.newrevision);
    }
    const wikiDomain = ___location.hostname;
    const cacheKey = `revisionDiff:${wikiDomain}:${revision.oldrevision}:${revision.newrevision || 'null'}`;
    const oneMonth = 30 * 24 * 60 * 60 * 1000;
    let addedURLs = [];
    let removedURLs = [];
    // BUG FIX: track cache hits explicitly. Previously a valid cached result
    // with no URLs was indistinguishable from a cache miss, so such revisions
    // were refetched (and re-cached) on every page view.
    let cacheHit = false;

    // Try reading from cache
    const cached = localStorage.getItem(cacheKey);
    if (cached) {
        try {
            const parsed = JSON.parse(cached);
            const age = Date.now() - parsed.timestamp;
            if (age < oneMonth) {
                console.log(`Cache hit for revision ${cacheKey}`);
                cacheHit = true;
                if (Array.isArray(parsed.addedURLs)) {
                    addedURLs = parsed.addedURLs;
                }
                if (Array.isArray(parsed.removedURLs)) {
                    removedURLs = parsed.removedURLs;
                }
            }
        } catch (e) {
            console.warn('Cache parse error, refetching:', e);
        }
    }

    // If not cached, fetch and process
    if (!cacheHit) {
        const wikitext = await fetchRevisionContent(lookup);
        const fromURLs = new Set(extractURLs(wikitext.oldrevision) || []);
        const toURLs = new Set(extractURLs(wikitext.newrevision) || []);

        if (revision.newrevision) {
            // URLs present in new revision but not in old revision = added
            addedURLs = [...toURLs].filter(url => !fromURLs.has(url));
            // URLs present in old revision but not in new revision = removed
            removedURLs = [...fromURLs].filter(url => !toURLs.has(url));
        } else {
            // For first revision, all URLs are considered added
            addedURLs = Array.from(fromURLs);
            removedURLs = [];
        }
        try {
            localStorage.setItem(cacheKey, JSON.stringify({
                timestamp: Date.now(),
                addedURLs,
                removedURLs
            }));
        } catch (e) {
            console.warn('Failed to store cache:', e);
        }
    }
    console.log(`Revision element: ${revision.element.innerHTML}
  Added URLs: ${addedURLs.join(' ')}
  Removed URLs: ${removedURLs.join(' ')}
  `);

    // Match domains to indicator types
    const matchedDomains = Object.keys(indicators).reduce((acc, key) => {
        acc[key] = [];
        return acc;
    }, {});

    // Process removed URLs first - these always get the "removed" indicator
    if (removedURLs.length > 0) {
        const removedDomains = [];
        for (const url of removedURLs) {
            try {
                const hostname = new URL(url).hostname;
                const ___domain = getRootDomain(hostname, publicSuffixSet);
                if (!removedDomains.includes(___domain)) {
                    removedDomains.push(___domain);
                }
            } catch (e) {
                console.warn(`Error processing removed URL ${url}:`, e);
            }
        }
        matchedDomains.removed = removedDomains;
    }

    // Process added URLs: each ___domain is attributed only to the
    // highest-priority indicator list that contains it.
    for (const url of addedURLs) {
        try {
            const hostname = new URL(url).hostname;
            const ___domain = getRootDomain(hostname, publicSuffixSet);
            let highestPriorityType = null;
            for (const type in indicators) {
                if (type !== 'removed' && indicators[type].list.has(___domain)) {
                    if (
                        highestPriorityType === null ||
                        indicators[type].priority > indicators[highestPriorityType].priority
                    ) {
                        highestPriorityType = type;
                    }
                }
            }
            if (
                highestPriorityType !== null &&
                !matchedDomains[highestPriorityType].includes(___domain)
            ) {
                matchedDomains[highestPriorityType].push(___domain);
                // Remove this ___domain from lower priority lists
                for (const type in indicators) {
                    if (
                        type !== 'removed' && // Never remove from "removed" list
                        indicators[type].priority < indicators[highestPriorityType].priority
                    ) {
                        matchedDomains[type] = matchedDomains[type].filter(d => d !== ___domain);
                    }
                }
            }
        } catch (e) {
            console.warn(`Error processing added URL ${url}:`, e);
        }
    }

    // Prepend emoji indicators - "removed" indicator should appear even if other indicators are present
    for (const type in indicators) {
        if (matchedDomains[type] && matchedDomains[type].length > 0) {
            prependEmojiWithTooltip(revision.element, type, matchedDomains[type]);
        }
    }
}

/**
 * Prepends an emoji and tooltip to a revision list entry DOM element if any
 * domains matched a warning list.
 *
 * @param {HTMLElement} element - The container element to prepend the emoji to.
 * @param {string} type - The type of indicator ('warning', 'caution', 'inspect').
 * @param {string[]} domains - The list of matched domains for the indicator.
 */
/**
 * Prepends an emoji and tooltip to a revision list entry DOM element if any
 * domains matched a warning list. A data attribute prevents the same
 * indicator type from being added twice to the same element.
 *
 * @param {HTMLElement} element - The container element to prepend the emoji to.
 * @param {string} type - The type of indicator ('warning', 'caution', 'inspect').
 * @param {string[]} domains - The list of matched domains for the indicator.
 */
function prependEmojiWithTooltip(element, type, domains) {
    const indicator = indicators[type];
    if (!indicator) {
        return;
    }
    const processedFlag = `data-processed-${type}`;
    if (element.getAttribute(processedFlag) === 'true') {
        return;
    }
    const marker = document.createElement('span');
    marker.textContent = indicator.emoji + " ";
    marker.title = `${indicator.msg}: ${domains.join(", ")}`;
    element.parentNode.insertBefore(marker, element);
    element.setAttribute(processedFlag, 'true');
}

/**
 * Extracts the first page object from MediaWiki API query response.
 *
 * @param {Object} data - MediaWiki API response.
 * @returns {Object|null} The first page object or null if unavailable.
 */
/**
 * Extracts the first page object from MediaWiki API query response.
 *
 * @param {Object} data - MediaWiki API response.
 * @returns {Object|null} The first page object or null if unavailable.
 */
async function getFirstPage(data) {
    const pages = data?.query?.pages;
    if (!pages) return null;
    const [firstPage] = Object.values(pages);
    return firstPage;
}

/**
 * Retrieves the first revision from a page object.
 *
 * @param {Object} page - Page object containing revisions.
 * @returns {Object|null} First revision object or null.
 */
/**
 * Retrieves the first revision from a page object.
 *
 * @param {Object} page - Page object containing revisions.
 * @returns {Object|null} First revision object or null.
 */
async function getFirstRevision(page) {
    const revisions = page.revisions;
    if (!revisions || revisions.length === 0) {
        return null;
    }
    return revisions[0];
}

/**
 * Fetches wikitext content for one or two revisions by ID.
 *
 * @param {string[]} revIds - Array of revision IDs.
 * @returns {Object} Object with `oldrevision` and optionally `newrevision` as wikitext strings.
 */
/**
 * Fetches wikitext content for one or two revisions by ID.
 *
 * @param {string[]} revIds - Array of revision IDs.
 * @returns {Object} Object with `oldrevision` and optionally `newrevision` as wikitext strings (null when unavailable).
 */
async function fetchRevisionContent(revIds) {
    const data = await fetchRevisionData({
        revids: revIds,
        rvprop: ['content'],
        rvslots: ['main']
    });
    const page = await getFirstPage(data);
    const wikitext = { oldrevision: null, newrevision: null };
    // BUG FIX: guard against a null page (e.g. when the API request fails and
    // safeFetch returns null); previously `page.revisions` threw a TypeError.
    if (page && page.revisions && page.revisions.length > 0) {
        wikitext.oldrevision = page.revisions[0].slots.main['*'] || null;
        if (page.revisions.length > 1) {
            wikitext.newrevision = page.revisions[1].slots.main['*'] || null;
        }
    }
    return wikitext;
}

/**
 * Fetches the parent revision IDs for a given list of revision IDs.
 *
 * @param {string[]} revisionIds - Array of revision IDs.
 * @returns {Object} Map of revision ID to its parent ID.
 */
/**
 * Fetches the parent revision IDs for a given list of revision IDs.
 *
 * @param {string[]} revisionIds - Array of revision IDs.
 * @returns {Object} Map of revision ID to its parent ID (empty on failure).
 */
async function fetchPreviousRevisionIds(revisionIds) {
    const data = await fetchRevisionData({
        revids: revisionIds,
        rvprop: ['ids']
    });
    const page = await getFirstPage(data);
    // BUG FIX: also guard against a page object without a revisions array
    // (e.g. a deleted or invalid revid); previously the for-of threw.
    if (!page || !Array.isArray(page.revisions)) return {};
    const revisionMap = {};
    for (const revision of page.revisions) {
        revisionMap[revision.revid] = revision.parentid;
    }
    return revisionMap;
}

/**
 * Fetches the ID of the first revision of a page.
 *
 * @param {string} pageTitle - The page title to look up.
 * @returns {number|null} Revision ID or null.
 */
/**
 * Fetches the ID of the first (oldest) revision of a page.
 *
 * @param {string} pageTitle - The page title to look up.
 * @returns {number|null} Revision ID or null.
 */
async function fetchFirstRevisionId(pageTitle) {
    const data = await fetchRevisionData({
        titles: [pageTitle],
        rvlimit: 1,
        rvdir: 'newer',
        rvprop: ['ids'],
    });
    const page = await getFirstPage(data);
    if (!page) {
        return null;
    }
    const revision = await getFirstRevision(page);
    if (!revision) {
        return null;
    }
    return revision.revid;
}

/**
 * Fetches the list of subpages from the list of lists, parses wikilinks, caches
 * the result, and returns list of subpage titles.
 *
 * @param {string} pageName - Title of the list-of-lists page.
 * @returns {Promise<string[]>} List of subpage titles.
 */
/**
 * Fetches the list of subpages from the list of lists, parses wikilinks, caches
 * the result (4-hour TTL), and returns the list of subpage titles.
 *
 * @param {string} pageName - Title of the list-of-lists page.
 * @returns {Promise<string[]>} List of subpage titles (empty on failure).
 */
async function fetchDomainListPages(pageName) {
    const cacheKey = `citationWatchlistFetchDomainListPages_${pageName}`;
    const cacheExpiration = 4 * 60 * 60 * 1000;
    const now = Date.now();
    const cachedData = localStorage.getItem(cacheKey);
    const cachedTimestamp = localStorage.getItem(`${cacheKey}_timestamp`);
    if (cachedData && cachedTimestamp && (now - parseInt(cachedTimestamp, 10)) <
        cacheExpiration) {
        console.log("Loaded list of lists from cache");
        return JSON.parse(cachedData);
    }
    const data = await fetchRevisionData({
        titles: [pageName],
        rvprop: ['content'],
        rvslots: ['*']
    });
    const page = await getFirstPage(data);
    if (!page) return [];
    // BUG FIX: a nonexistent page comes back as a stub without `revisions`;
    // previously `page.revisions[0]` threw a TypeError.
    if (!page.revisions || !page.revisions[0]) return [];
    const content = page.revisions[0].slots.main['*'];
    const pageTitles = [];
    const lines = content.split('\n');
    for (let line of lines) {
        // Only bullet entries of the form "* [[Page Title]]" are recognized.
        if (line.startsWith('* [[')) {
            const match = line.match(
                /\[\[([^\]]+)\]\]/); // Matches the first instance of [[Page Title]]
            if (match) {
                pageTitles.push(match[1]);
            }
        }
    }
    localStorage.setItem(cacheKey, JSON.stringify(pageTitles));
    localStorage.setItem(`${cacheKey}_timestamp`, now.toString());
    console.log("Loaded from API and stored in cache");
    return pageTitles;
}

/**
 * Loads ___domain lists from a set of pages, categorizes them by indicator section
 * headers, and populates the corresponding `Set` in the global `indicators` object.
 *
 * @param {string[]} pageNames - List of page titles to fetch.
 * @returns {Object} Updated indicators object with ___domain sets.
 */
/**
 * Loads ___domain lists from a set of pages (with a 6-hour localStorage cache),
 * categorizes them by indicator section headers, and populates the
 * corresponding `Set` in the global `indicators` object.
 *
 * @param {string[]} pageNames - List of page titles to fetch.
 * @returns {Object} Updated indicators object with ___domain sets.
 */
async function fetchAndOrganizeDomainLists(pageNames) {
    const cacheTTL = 6 * 60 * 60 * 1000;
    const now = Date.now();
    const cachedData = {};
    const pagesToFetch = [];
    for (const title of pageNames) {
        const cacheKey = `domainList:${___location.hostname}:${title}`;
        const cached = localStorage.getItem(cacheKey);
        if (cached) {
            try {
                const parsed = JSON.parse(cached);
                if (now - parsed.timestamp < cacheTTL && parsed.content) {
                    console.log(`Using cached content for page: ${title}`);
                    cachedData[title] = parsed.content;
                    continue;
                } else {
                    console.log(`Cache expired for page: ${title}`);
                }
            } catch (e) {
                console.warn(`Cache error for ${title}:`, e);
            }
        }
        console.log(`Will fetch page: ${title}`);
        pagesToFetch.push(title);
    }
    let fetchedPages = {};
    if (pagesToFetch.length > 0) {
        const apiData = await fetchRevisionData({
            titles: pagesToFetch,
            rvprop: ['content'],
            rvslots: ['*'],
        });
        const pages = apiData.query.pages;
        for (const pageId in pages) {
            const page = pages[pageId];
            const title = page.title;
            // BUG FIX: a nonexistent list page is returned as a stub without
            // `revisions`; previously `page.revisions[0]` threw and aborted
            // processing of the whole batch.
            if (!page.revisions || !page.revisions[0]) {
                console.warn(`No content available for page: ${title}`);
                continue;
            }
            const content = page.revisions[0].slots.main['*'];
            fetchedPages[title] = content;
            const cacheKey = `domainList:${___location.hostname}:${title}`;
            try {
                localStorage.setItem(cacheKey, JSON.stringify({
                    timestamp: now,
                    content,
                }));
                console.log(`Cached content for page: ${title}`);
            } catch (e) {
                console.warn(`Failed to cache ${title}:`, e);
            }
        }
    }
    const allContent = { ...cachedData, ...fetchedPages };
    for (const title in allContent) {
        const content = allContent[title];
        // The most recent section header seen determines which indicator set
        // subsequent "* ___domain" bullets are added to.
        let currentList = null;
        const lines = content.split('\n');
        for (let line of lines) {
            for (const type in indicators) {
                if (line.trim() === indicators[type].section) {
                    currentList = indicators[type].list;
                    break;
                }
            }
            if (line.startsWith('*') && currentList) {
                const ___domain = line.substring(1).trim();
                // Discard ___domain entries with slashes (indicating domains with paths)
                if (!___domain.includes('/')) {
                    currentList.add(___domain);
                }
            }
        }
    }
    return indicators;
}

/**
 * Fetches and caches the public suffix list used to identify top-level domains.
 *
 * @returns {Promise<Set<string>>} Set of public suffixes.
 */
/**
 * Fetches and caches the public suffix list used to identify top-level domains.
 *
 * The raw page text is cached in localStorage for 24 hours. Empty lines and
 * "//" comment lines are discarded.
 *
 * @returns {Promise<Set<string>>} Set of public suffixes (empty on failure).
 */
async function fetchPublicSuffixList() {
    const cacheKey = 'publicSuffixListCache';
    const cacheTTL = 24 * 60 * 60 * 1000;
    // Single parser shared by the cached and freshly-fetched paths; previously
    // the same filtering logic was duplicated in two slightly different forms.
    const parseSuffixes = (text) => new Set(
        text.split('\n')
            .map(line => line.trim())
            .filter(line => line && !line.startsWith('//'))
    );
    const cached = localStorage.getItem(cacheKey);
    if (cached) {
        try {
            const parsed = JSON.parse(cached);
            const age = Date.now() - parsed.timestamp;
            if (age < cacheTTL && parsed.content) {
                console.log('Using cached public suffix list');
                return parseSuffixes(parsed.content);
            }
        } catch (e) {
            console.warn('Error parsing cache, refetching:', e);
        }
    }
    const pslUrl = mw.config.get('wgArticlePath').replace('$1', publicSuffixList)
        + '?action=raw';
    console.log(`Raw page text request: ${pslUrl}`);
    const content = await safeFetch(fetch, pslUrl).then(response => response ?
        response.text() : null);
    if (!content) return new Set();
    try {
        localStorage.setItem(cacheKey, JSON.stringify({
            timestamp: Date.now(),
            content
        }));
    } catch (e) {
        console.warn('Failed to write to cache:', e);
    }
    return parseSuffixes(content);
}

/**
 * Makes a MediaWiki API call to fetch revision metadata or content.
 *
 * @param {Object} data - Options for the API call, such as `revids`, `titles`, `rvprop`, etc.
 * @returns {Promise<Object>} MediaWiki API query result.
 */
/**
 * Makes a MediaWiki API call to fetch revision metadata or content.
 *
 * Array-valued parameters are joined with '|' per the MediaWiki API
 * multi-value convention.
 *
 * @param {Object} data - Options for the API call, such as `revids`, `titles`, `rvprop`, etc.
 * @returns {Promise<Object>} MediaWiki API query result.
 */
async function fetchRevisionData(data) {
    const params = {
        action: 'query',
        prop: 'revisions',
        format: 'json',
        rvdir: data.rvdir || 'older',
        origin: '*'
    };
    if (data.rvlimit) {
        params.rvlimit = data.rvlimit;
    }
    for (const key of ['rvprop', 'revids', 'titles', 'rvslots']) {
        const value = data[key];
        if (value) {
            params[key] = Array.isArray(value) ? value.join('|') : value;
        }
    }
    const api = new mw.Api();
    return await safeFetch(api.get.bind(api), params);
}

/**
 * Wraps any asynchronous fetch function with retry logic and error handling.
 *
 * @param {Function} fn - The function to execute (usually an API call).
 * @param {...any} args - Arguments to pass to the fetch function.
 * @param {Object} options - Optional configuration for the fetch operation.
 * @param {number} options.retries - Number of retry attempts (default: 2).
 * @param {number} options.retryDelay - Delay between retries in ms (default: 1000).
 * @returns {Promise<any|null>} Result of the fetch or null on failure.
 */
/**
 * Wraps any asynchronous fetch function with retry logic and error handling.
 *
 * An optional trailing argument created via `safeFetch.withOptions()` is
 * consumed by the wrapper and not forwarded to `fn`.
 *
 * @param {Function} fn - The function to execute (usually an API call).
 * @param {...any} args - Arguments to pass to the fetch function.
 * @returns {Promise<any|null>} Result of the fetch or null after all attempts fail.
 */
async function safeFetch(fn, ...args) {
    // Pop the options object off the argument list if one was supplied.
    let config = { retries: 2, retryDelay: 1000 };
    const tail = args[args.length - 1];
    if (args.length > 0 && typeof tail === 'object' && tail._isSafeFetchOptions) {
        config = { ...config, ...args.pop() };
    }

    const maxAttempts = config.retries + 1;
    const label = fn.name || 'fetch operation';

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            const result = await fn(...args);

            // Treat missing results as failures so they trigger a retry.
            if (result === null || result === undefined) {
                throw new Error('Received null or undefined response');
            }

            // Fetch API responses signal HTTP failure via `ok`.
            if (typeof result.ok === 'boolean' && !result.ok) {
                throw new Error(`HTTP error ${result.status}: ${result.statusText || 'Unknown error'}`);
            }

            return result;
        } catch (error) {
            if (attempt < maxAttempts) {
                console.warn(`Error during ${label} (attempt ${attempt}/${maxAttempts}):`,
                    error.message || error);
                // Pause before the next attempt.
                await new Promise(resolve => setTimeout(resolve, config.retryDelay));
            } else {
                console.error(`All ${maxAttempts} attempts failed for ${label}:`,
                    error.message || error);
            }
        }
    }

    // Every attempt failed.
    return null;
}

/**
 * Builds a trailing options argument for safeFetch.
 *
 * @param {number} [retries] - Number of retry attempts (default: 2).
 * @param {number} [retryDelay] - Delay between retries in ms (default: 1000).
 * @returns {Object} Marked options object recognized by safeFetch.
 */
safeFetch.withOptions = function(retries, retryDelay) {
    return {
        retries: retries || 2,
        retryDelay: retryDelay || 1000,
        _isSafeFetchOptions: true
    };
};

/**
 * Extracts all HTTP(S) URLs from a given wikitext string.
 *
 * @param {string} wikitext - Raw wikitext revision content.
 * @returns {string[]} List of valid extracted URLs.
 */
/**
 * Extracts all HTTP(S) URLs from a given wikitext string.
 *
 * URLs are validated and normalized via the URL constructor; duplicates
 * (after normalization) are returned only once, in order of first appearance.
 *
 * @param {string} wikitext - Raw wikitext revision content.
 * @returns {string[]} List of valid extracted URLs.
 */
function extractURLs(wikitext) {
    if (!wikitext) return [];

    const urlPattern = /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)/g;
    const seen = new Set();
    const urls = [];
    for (const [raw] of wikitext.matchAll(urlPattern)) {
        try {
            // The URL constructor both validates and normalizes the match.
            const normalized = new URL(raw).href;
            if (!seen.has(normalized)) {
                seen.add(normalized);
                urls.push(normalized);
            }
        } catch (error) {
            console.error(`Invalid URL rejected: ${raw}`, error.message);
        }
    }
    return urls;
}

/**
 * Extracts the top-level ___domain from a full hostname using a public suffix set.
 *
 * @param {string} hostname - Full hostname (e.g., sub.example.co.uk).
 * @param {Set<string>} publicSuffixSet - Set of known public suffixes.
 * @returns {string} The top-level ___domain (e.g., example.co.uk).
 */
/**
 * Extracts the top-level ___domain from a full hostname using a public suffix set.
 *
 * IPv4 literals and single-label hostnames are returned unchanged. If no
 * suffix from the set matches, the last two labels are returned as a fallback.
 *
 * NOTE(review): entries prefixed with "!" (PSL exception rules) are treated
 * here the same as ordinary suffixes, and wildcard ("*.") rules are not
 * handled — presumably acceptable for watchlist purposes; confirm against the
 * Public Suffix List format if stricter parsing is ever needed.
 *
 * @param {string} hostname - Full hostname (e.g. sub.example.co.uk).
 * @param {Set<string>} publicSuffixSet - Set of known public suffixes.
 * @returns {string} The top-level ___domain (e.g. example.co.uk).
 */
function getRootDomain(hostname, publicSuffixSet) {
    if (!hostname || typeof hostname !== 'string') {
        console.warn('Invalid hostname provided to getRootDomain:', hostname);
        return '';
    }

    // IPv4 literals have no ___domain hierarchy; return them unchanged.
    if (/^(\d{1,3}\.){3}\d{1,3}$/.test(hostname)) {
        return hostname;
    }

    const parts = hostname.split('.');

    // A single-label hostname (e.g. "localhost") is returned as-is.
    if (parts.length < 2) {
        return hostname;
    }

    // Scan from the longest candidate suffix down to the shortest.
    for (let i = 0; i < parts.length; i++) {
        const candidate = parts.slice(i).join('.');
        const listed = publicSuffixSet.has(candidate) ||
            publicSuffixSet.has(`!${candidate}`);
        if (!listed) continue;
        // Include one label before the suffix; if the entire hostname is
        // itself a suffix, return it unchanged.
        return i > 0 ? parts.slice(i - 1).join('.') : hostname;
    }

    // No suffix matched: two-label hostnames are already root domains;
    // otherwise fall back to the last two labels.
    return parts.length === 2 ? hostname : parts.slice(-2).join('.');
}

/**
 * Determines whether a given page title does *not* belong to the main or draft namespaces.
 *
 * @param {string} pageTitle - The title of the page.
 * @returns {boolean} True if not an article namespace.
 */
/**
 * Determines whether a given page title does *not* belong to the main or
 * draft namespaces, by checking it against the module-level `namespaces`
 * prefix list.
 *
 * @param {string} pageTitle - The title of the page.
 * @returns {boolean} True if not an article namespace.
 */
function isNotArticle(pageTitle) {
    for (const prefix of namespaces) {
        if (pageTitle.startsWith(prefix)) {
            return true;
        }
    }
    return false;
}

/**
 * Cleans up expired localStorage cache entries based on known cache key prefixes and TTLs.
 */
/**
 * Cleans up expired localStorage cache entries based on known cache key
 * prefixes and TTLs. Handles both paired key/"_timestamp" entries and JSON
 * entries with an embedded `timestamp` field.
 */
function purgeExpiredCache() {
    const now = Date.now();
    // Define cache configurations with their TTLs in milliseconds
    const knownCaches = [
        { prefix: 'revisionDiff:', ttl: 30 * 24 * 60 * 60 * 1000, description: 'Revision diff cache' },
        { prefix: 'domainList:', ttl: 6 * 60 * 60 * 1000, description: 'Domain list cache' },
        { prefix: 'publicSuffixListCache', ttl: 24 * 60 * 60 * 1000, description: 'Public suffix list cache' },
        { prefix: 'citationWatchlistFetchDomainListPages_', ttl: 4 * 60 * 60 * 1000, description: 'Domain list pages cache' }
    ];

    // Track statistics for logging
    const stats = { checked: 0, expired: 0, errors: 0 };

    try {
        // BUG FIX: snapshot all keys before removing anything. Removing items
        // while iterating by index shifts localStorage's key indices, which
        // previously caused some entries to be skipped.
        const keys = [];
        for (let i = 0; i < localStorage.length; i++) {
            const key = localStorage.key(i);
            if (key) keys.push(key);
        }

        for (const key of keys) {
            // Check if this key belongs to one of our known caches
            for (const cache of knownCaches) {
                if (key.startsWith(cache.prefix)) {
                    stats.checked++;
                    try {
                        if (key.endsWith('_timestamp')) {
                            // Handle paired key-timestamp entries
                            const baseKey = key.replace(/_timestamp$/, '');
                            const timestampStr = localStorage.getItem(key);

                            if (!timestampStr) {
                                // Orphaned timestamp key without a value
                                localStorage.removeItem(key);
                                console.log(`Removed orphaned timestamp key: ${key}`);
                                stats.expired++;
                                continue;
                            }

                            const timestamp = parseInt(timestampStr, 10);
                            if (isNaN(timestamp) || now - timestamp > cache.ttl) {
                                // Expired or invalid timestamp
                                localStorage.removeItem(key);

                                // Also remove the base key if it exists
                                if (localStorage.getItem(baseKey) !== null) {
                                    localStorage.removeItem(baseKey);
                                    console.log(`Purged expired ${cache.description}: ${baseKey}`);
                                } else {
                                    console.log(`Removed orphaned timestamp for missing key: ${baseKey}`);
                                }
                                stats.expired++;
                            }
                        } else {
                            // Handle JSON entries with embedded timestamps
                            const value = localStorage.getItem(key);
                            if (!value) {
                                // Empty value, just remove it
                                localStorage.removeItem(key);
                                console.log(`Removed empty cache entry: ${key}`);
                                stats.expired++;
                                continue;
                            }

                            try {
                                const parsed = JSON.parse(value);
                                if (parsed && parsed.timestamp && now - parsed.timestamp > cache.ttl) {
                                    // Expired based on embedded timestamp
                                    localStorage.removeItem(key);
                                    console.log(`Purged expired ${cache.description}: ${key}`);
                                    stats.expired++;
                                }
                            } catch (jsonError) {
                                // Invalid JSON, remove the entry
                                localStorage.removeItem(key);
                                console.warn(`Removed invalid JSON cache entry: ${key}`, jsonError.message);
                                stats.errors++;
                                stats.expired++;
                            }
                        }
                    } catch (itemError) {
                        console.warn(`Error processing cache item ${key}:`, itemError.message);
                        stats.errors++;
                        // Try to remove problematic entries
                        try {
                            localStorage.removeItem(key);
                            console.log(`Removed problematic cache entry: ${key}`);
                            stats.expired++;
                        } catch (removeError) {
                            console.error(`Failed to remove problematic entry ${key}:`, removeError.message);
                        }
                    }
                    break; // Once we've matched a cache type, no need to check others
                }
            }
        }

        // Log summary statistics
        if (stats.checked > 0) {
            console.log(`Cache cleanup complete: checked ${stats.checked} items, removed ${stats.expired} expired items, encountered ${stats.errors} errors`);
        }
    } catch (globalError) {
        console.error('Fatal error during cache cleanup:', globalError.message);
    }
}

// Kick off the script. BUG FIX: attach a .catch so any failure in
// analyzeView is logged instead of surfacing as an unhandled rejection.
analyzeView()
    .then(() => console.log('Citation Watchlist script finished executing'))
    .catch(err => console.error('Citation Watchlist script failed:', err));