Wikidata:Database reports/removed sitelinks/Configuration

The most recent version of this script can be found at Github: removed_sitelinks. The script is run on Toolforge in the deltabot tool account; Toolforge users usually have read-access to all scripts in that tool account.

A potentially outdated version is stored onwiki at User:DeltaBot/source/removed_sitelinks in order to be permanently available and conveniently accessible; it is being displayed on this page with code formatting. Mind that the onwiki source code might have been slightly altered in order to prevent onlyinclude directives from being effective, and that the wikitext parser im some situations messes up the formatted output of the transcluding page content including the source code.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# licensed under CC-Zero: https://creativecommons.org/publicdomain/zero/1.0

from datetime import datetime
from os.path import expanduser
import re
from time import strftime

import mariadb
import pywikibot as pwb


DBNAMES = [
    'dewiki',
    'fawiki',
    'frwiki',
    'nlwiki',
    'ptwiki',
    'ruwiki',
]

HEADER = """A list of {dbname} pages which were disconnected from Wikidata. Data as of <only""" + """include>{update_timestamp}</onlyinclude>.

{{| class="wikitable sortable" style="width:100%; margin:auto;"
|-
! Page !! Item !! Comment !! User !! Time
"""

TABLE_ROW = """|-
| [[:{iw_prefix}:{page_title}]] || [[{qid}]] || {comment} || [[User:{user_name}|{user_name}]] || {timestamp}
"""

FOOTER = """|}

[[Category:Database reports|removed sitelinks]]"""


def make_report(dbname:str) -> str:
    db = mariadb.connect(
        host=f'{dbname}.analytics.db.svc.wikimedia.cloud',
        database=f'{dbname}_p',
        default_file=f'{expanduser("~")}/replica.my.cnf',
    )
    cur = db.cursor(dictionary=True)

    query = """SELECT
    CONVERT(page_title USING utf8) AS page_title,
    CONVERT(comment_text USING utf8) AS comment_text,
    CONVERT(actor_name USING utf8) AS actor_name,
    CONVERT(rc_timestamp USING utf8) AS rc_timestamp,
    CONVERT(rc_params USING utf8) AS rc_params
FROM
    page
        JOIN recentchanges ON rc_cur_id=page_id
            JOIN comment_recentchanges ON rc_comment_id=comment_id
            JOIN actor_recentchanges ON rc_actor=actor_id
        LEFT JOIN page_props ON page_id=pp_page AND pp_propname='wikibase_item'
WHERE
    rc_type=5
    AND rc_source='wb'
    AND page_namespace=0
    AND page_is_redirect=0
    AND pp_page IS NULL
ORDER BY
    page_title ASC"""

    cur.execute(query)
    result = cur.fetchall()
    cur.close()
    db.close()

    text = ''
    for row in result:
        page_title = row.get('page_title')
        rc_comment = row.get('comment_text')
        rc_user_text = row.get('actor_name')
        rc_timestamp = row.get('rc_timestamp')
        rc_params = row.get('rc_params')

        if page_title is None or rc_comment is None or rc_user_text is None or rc_timestamp is None or rc_params is None:
            continue

        if dbname not in rc_params:
            continue

        timestamp = datetime.strptime(rc_timestamp, '%Y%m%d%H%M%S')

        res = re.search('"(Q\d+)"', rc_params)
        if res:
            qid = res.group(1)
        else:
            qid = ''

        text += TABLE_ROW.format(
            iw_prefix=dbname[:-4],
            page_title=page_title.replace('_', ' '),
            qid=qid,
            comment=rc_comment,
            user_name=rc_user_text.replace('wikidata>', ''),
            timestamp=timestamp.strftime('%Y-%m-%d %H:%M'),
        )


    return text


def main() -> None:
    for dbname in DBNAMES:
        text = HEADER.format(dbname=dbname, update_timestamp=strftime('%Y-%m-%d %H:%M (%Z)')) + make_report(dbname) + FOOTER

        page = pwb.Page(pwb.Site('wikidata', 'wikidata'), f'Wikidata:Database reports/removed sitelinks/{dbname}')
        page.text = text
        page.save(summary='Bot:Updating database report', minor=False)


if __name__=='__main__':
    main()