Wikidata:Database reports/Humans with missing claims/Configuration

The most recent version of this script can be found on GitHub: humans_with_missing_claims. The script runs on Toolforge under the deltabot tool account; Toolforge users usually have read access to all scripts in that tool account.

A potentially outdated copy is stored onwiki at User:DeltaBot/source/humans_with_missing_claims so that it remains permanently available and conveniently accessible; it is displayed on this page with code formatting. Note that the onwiki source code may have been slightly altered to keep onlyinclude directives from taking effect, and that the wikitext parser in some situations garbles the formatted output of the transcluded page content, including the source code.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
#licensed under CC-Zero: https://creativecommons.org/publicdomain/zero/1.0

from json.decoder import JSONDecodeError
import re
from time import strftime
from typing import Any

import pywikibot as pwb
import requests


SITE = pwb.Site('wikidata', 'wikidata')
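# claims whose absence is reported for every human item: P21 (sex or gender),
# P19 (place of birth), P569 (date of birth), P734 (family name), P735 (given name)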
MISSING_PROPERTIES = [ 'P21', 'P19', 'P569', 'P734', 'P735' ]
USER_AGENT = f'{requests.utils.default_user_agent()} (humans_with_missing_claims.py via User:DeltaBot at Wikidata; mailto:tools.deltabot@toolforge.org)'


def query_wdqs(query:str) -> list[dict[str, Any]]:
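    # send a SPARQL query to the Wikidata Query Service (WDQS) and return the result bindings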
    response = requests.post(
        url='https://query.wikidata.org/bigdata/namespace/wdq/sparql',
        data={
            'query' : query,
        },
        timeout=65,
        headers={
            'Accept' : 'application/sparql-results+json',
            'User-Agent' : USER_AGENT,
        }
    )

    try:
        payload = response.json()
    except JSONDecodeError as exception:
        raise RuntimeWarning('Cannot parse JSON response') from exception

    return payload.get('results', {}).get('bindings', [])
    

def create_summary(counts:dict[str, dict[str, int]]) -> None:
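    # build the sortable summary table (one row per requested property, one column per
    # checked property, linking to the report subpage sections) and save the main page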
    props = list(counts.keys())
    props.sort(key=lambda x: int(x[1:]))

    # the opening "onlyinclude" tag is built from two string pieces so that the literal
    # tag never appears in the onwiki copy of this source; the generated summary page
    # still receives the complete tag around the timestamp
    text = f'Update: <only' + f'include>{strftime("%Y-%m-%d %H:%M (%Z)")}</onlyinclude>\n\n{{| class="wikitable sortable"\n! Id !! Property '

    for p2 in MISSING_PROPERTIES:
        text += f'!! {p2} '

    for p1 in props:
        text += f'\n|-\n|data-sort-value={p1[1:]}| [[Property:{p1}|{p1}]] || [[/{p1}|{p1}]]'

        for p2 in MISSING_PROPERTIES:
            if p2 in counts[p1]:
                text += f' || [[/{p1}#{p2}|{counts[p1][p2]}]]'
            else:
                text += ' || -'

    text += '\n|}\n\nNew reports can be requested on [[/input]].\n[[Category:Database reports]]'

    page = pwb.Page(SITE, 'Wikidata:Database reports/Humans with missing claims')
    page.text = text
    page.save(summary='upd', minor=False)


def create_report(p1:str, results:dict[str, list[str]], counts:dict[str, int]) -> None:
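    # write the per-property report subpage: one section per checked property, listing
    # the items (at most 1000, the query limit) that have p1 but are missing it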
    cnt = 0
    text = ''

    for p2 in MISSING_PROPERTIES:
        cnt_p2 = counts.get(p2)
        if cnt_p2 is None:
            continue
        
        text += f'== <span id="{p2}"></span> Missing {p2} ==\n'
        text += f'count: {cnt_p2}\n\n'

        for qid in results[p2]:
            cnt += 1
            # the first entries use the Q template (renders item label and link); later
            # entries fall back to plain wikilinks, presumably to limit template expansion
            if cnt < 2500:
                text += f'*{{{{Q|{qid}}}}}\n'
            else:
                text += f'*[[{qid}]]\n'

        if cnt_p2 > 1000:
            skip = cnt_p2-1000
            text += f'{skip} records skipped\n'

    if len(text)==0:
        return

    text += '__FORCETOC__'

    page = pwb.Page(SITE, f'Wikidata:Database reports/Humans with missing claims/{p1}')
    page.text = text
    page.save(summary=f'report update for [[Property:{p1}]]', minor=False)


def create_lists(properties:list[str]) -> None:
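    # for every requested property p1: find humans (P31=Q5) that have p1 but lack each
    # property in MISSING_PROPERTIES, write the per-property report subpage, and finally
    # update the summary table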
    sparql = """SELECT ?item WHERE {{
    ?item wdt:{p1} [] .
    ?item wdt:P31 wd:Q5 .
    OPTIONAL {{
        ?item wdt:{p2} ?missing .
    }}
    FILTER(!BOUND(?missing)) .
}} ORDER BY ?item LIMIT 1000"""

    sparql_count = """SELECT (COUNT(DISTINCT ?item) AS ?cnt) WHERE {{
    ?item wdt:{p1} [] .
    ?item wdt:P31 wd:Q5 .
    OPTIONAL {{
        ?item wdt:{p2} ?missing .
    }}
    FILTER(!BOUND(?missing)) .
}}"""

    counts:dict[str, dict[str, int]] = {}
    for p1 in properties:
        results:dict[str, list[str]] = {}
        counts[p1] = {}
        for p2 in MISSING_PROPERTIES:
            results[p2] = []
            
            try:
                payload_1 = query_wdqs(sparql.format(p1=p1, p2=p2))
            except RuntimeWarning as exception:  # TODO: times out sometimes
                print(f'{exception} for {p1} and {p2}/main query')
                continue

            try:
                payload_2 = query_wdqs(sparql_count.format(p1=p1, p2=p2))
            except RuntimeWarning as exception:  # TODO: times out sometimes
                print(f'{exception} for {p1} and {p2}/count query')
                continue

            for row in payload_1:
                qid = row.get('item', {}).get('value', '')[len('http://www.wikidata.org/entity/'):]
                results[p2].append(qid)

            counts[p1][p2] = int(payload_2[0].get('cnt', {}).get('value', '0'))

        if len(counts[p1]) > 0:
            create_report(p1, results, counts[p1])

    create_summary(counts)


def read_input() -> list[str]:
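    # fetch the raw text of the /input subpage; every non-comment line is expected to
    # contain exactly one property ID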
    page = pwb.Page(SITE, 'Wikidata:Database reports/Humans with missing claims/input')
    text = page.get()
    lines = text.split('\n')

    return lines


def main() -> None:
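    # read the requested properties from the /input page, keep only lines that are
    # plain property IDs ("P" followed by digits), and regenerate the reports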
    lines = read_input()

    properties:list[str] = []
    for line in lines:
        if line.startswith('#'):
            continue

        if re.match(r'^P\d+$', line) is None:
            continue

        properties.append(line)

    create_lists(properties)


if __name__ == '__main__':
    main()