#!/usr/bin/python
# -*- coding: UTF-8 -*-
#based on Pasleim's other reports
import MySQLdb
import pywikibot
import time
import urllib2
import json
import re
site = pywikibot.Site('wikidata','wikidata')
repo = site.data_repository()
properties = ('P364','P1476','P577','P21', 'P26', 'P1412', 'P22', 'P551', 'P345','P161','P57','P162','P915','P58','P119','P1196','P570','P195','P170','P186','P180','P31','P105','P171') #properties for which the reports are to be created (sql below)
reportscurrentlydeactivated = ('P106', 'P735', 'P569', 'P214')
validateQ5 = ('P569', 'P21', 'P735', 'P106', 'P26', 'P214', 'P1412', 'P22','P119','P551','P1196','P570') #properties for which the sql result needs to be checked for existence of P31=Q5
validateQ11424 = ('P364','P1476','P577','P345','P161','P57','P162','P915','P58','P364') # 'P364','P1476','P577', checked for films only
validateQ16521 = ('P105','P171')
validateartwork = ('P195','P170','P186','P180') # start with CLAIM[31:3305213]
validateP31 = ('P31') #link table outdated?
validateproperty = ('P646') #link table outdated?
maxreportlength = 488
minvalue = 0
maxvalue = 0
itemlist = ''
ignoreit = ('Q4617','Q1362169','Q487612','Q4115189','Q13406268','Q15397819','Q302')
def makeReport(db, pproperty):
if pproperty == 'P31':
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM wb_items_per_site, wb_entity_per_page WHERE epp_page_id NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P31","P279") AND pl_namespace = 120) AND epp_entity_id = ips_item_id GROUP BY epp_page_id HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 900'
if pproperty == 'P646': #historic, slow/to be deactivated after initial run
query1 = ''
elif pproperty == 'P21': # sex/gender
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P31" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P21" AND pl_namespace = 120) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q13406463" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q4167836" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q41710" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q225469" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q8047" AND pl_namespace = 0) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P360" AND pl_namespace = 120) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "Q28640" AND pl_namespace = 0) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >2 ORDER BY COUNT(ips_site_id) DESC LIMIT 800'
elif pproperty == 'P22': # family
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P22","P40","P7","P25","P9","P53","P1038","P451") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 650'
elif pproperty == 'P106': # activities
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P106","P39","P97") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 650'
elif pproperty == 'P551': # locations
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P19","P20","P551","P119","P937") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 650'
elif pproperty == 'P569': # dates
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title In ("P569","P570","P1317") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
elif pproperty == 'P735': # first name
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P735" AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 1000'
elif pproperty == 'P1412': # languages
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P1412" AND pl_namespace = 120) AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P103" AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
elif pproperty == 'P119': #with P1442 only
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P1442" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P119" AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
elif pproperty == 'P26': #with P570 only
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P570" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title = "P26" AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
elif pproperty == 'P1196':
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P509" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P1196","P1347") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
if pproperty == 'P570':
query1 = 'SELECT pp_value, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM page, wb_entity_per_page, wb_items_per_site, enwiki_p.categorylinks JOIN enwiki_p.page_props ON pp_page=cl_from WHERE ( (cl_to LIKE "1%" AND cl_to RLIKE "1[0-9]{3}_deaths") OR (cl_to LIKE "20%" AND cl_to RLIKE "20(0[0-9]|1[0-4])_deaths") ) AND pp_propname="wikibase_item" AND cl_type="page" AND pp_value NOT IN (SELECT page_title FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P570","P746") AND pl_namespace = 120) AND pp_value = page_title AND epp_entity_id = ips_item_id AND epp_entity_type = "item" AND page_id = epp_page_id AND page_namespace = 0 GROUP BY pp_value HAVING COUNT(ips_site_id) >2 ORDER BY COUNT(ips_site_id) DESC LIMIT 800'
elif pproperty == 'P214': # people
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, pagelinks AS pl2, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P21" AND pl2.pl_from_namespace=0 AND pl2.pl_namespace=0 AND pl2.pl_title="Q5" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P214","P54","P641","P345","P434","P1047","P1186","P102") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND pl1.pl_from = pl2.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >5 ORDER BY COUNT(ips_site_id) DESC LIMIT 700'
elif pproperty == 'P345': # films
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=0 AND pl1.pl_title="Q11424" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("P345","P971","P360","P301","P279") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >1 ORDER BY COUNT(ips_site_id) DESC LIMIT 850'
elif pproperty in validateartwork:
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=0 AND pl1.pl_title="Q3305213" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("'+pproperty+'","P279") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >1 ORDER BY COUNT(ips_site_id) DESC LIMIT 600'
elif pproperty in validateQ16521:
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=120 AND pl1.pl_title="P225" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("'+pproperty+'") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >1 ORDER BY COUNT(ips_site_id) DESC LIMIT 550'
elif pproperty in validateQ11424:
query1 = 'SELECT CONCAT("Q",ips_item_id) as item, COUNT(ips_site_id) as cnt, GROUP_CONCAT(REPLACE(ips_site_id,"wiki","") ORDER BY ips_site_id) as sites FROM pagelinks AS pl1, wb_items_per_site, wb_entity_per_page WHERE pl1.pl_from_namespace=0 AND pl1.pl_namespace=0 AND pl1.pl_title="Q11424" AND pl1.pl_from NOT IN (SELECT pl_from FROM wikidatawiki_p.page JOIN wikidatawiki_p.pagelinks ON page_id=pl_from WHERE pl_title in ("'+pproperty+'","P279") AND pl_namespace = 120) AND epp_page_id = pl1.pl_from AND epp_entity_id = ips_item_id GROUP BY pl1.pl_from HAVING COUNT(ips_site_id) >1 ORDER BY COUNT(ips_site_id) DESC LIMIT 850'
cursor = db.cursor()
cursor.execute(query1)
text = ''
table_row = '{{{{TR missing P|{0}|{1}|{2}|{3}}}}}\n'
reportlength = 0
skipped = 0
for q, cnt, sites in cursor:
skipped +=1
data = json.load(urllib2.urlopen('https://www.wikidata.org/w/api.php?action=wbgetentities&ids=%s&props=labels|claims&format=json' % q))
if pproperty in validateQ5:
if 'claims' in data['entities'][q]:
if 'P279' in data['entities'][q]['claims']:
validity = 0
continue
if 'P31' in data['entities'][q]['claims']:
validity = 0
for m in data['entities'][q]['claims']['P31']:
if m['mainsnak']['datavalue']['value']['numeric-id'] == 5:
validity = 1
if validity == 0:
continue
elif pproperty in validateQ11424:
if 'claims' in data['entities'][q]:
if 'P31' in data['entities'][q]['claims']:
validity = 0
for m in data['entities'][q]['claims']['P31']:
if m['mainsnak']['datavalue']['value']['numeric-id'] == 11424:
validity = 1
if validity == 0:
continue
elif pproperty in validateartwork:
if 'claims' in data['entities'][q]:
if 'P31' in data['entities'][q]['claims']:
validity = 0
for m in data['entities'][q]['claims']['P31']:
if m['mainsnak']['datavalue']['value']['numeric-id'] == 3305213:
validity = 1
if validity == 0:
continue
elif pproperty in validateQ16521:
if 'claims' in data['entities'][q]:
if 'P31' in data['entities'][q]['claims']:
validity = 0
for m in data['entities'][q]['claims']['P31']:
if m['mainsnak']['datavalue']['value']['numeric-id'] == 16521:
validity = 1
if validity == 0:
continue
elif pproperty in validateP31:
validity = 1
if 'claims' in data['entities'][q]:
if 'P31' in data['entities'][q]['claims']:
validity = 0
if 'P279' in data['entities'][q]['claims']:
validity = 0
elif pproperty in validateproperty:
validity = 1
if 'claims' in data['entities'][q]:
if pproperty in data['entities'][q]['claims']:
validity = 0
else:
validity = 1
if (q in ignoreit) == True:
validity = 0
if validity == 1:
label = q
if 'labels' in data['entities'][q]:
if 'en' in data['entities'][q]['labels']:
label = data['entities'][q]['labels']['en']['value']
elif (sites.split(",")[0]) in data['entities'][q]['labels']:
label = data['entities'][q]['labels'][(sites.split(",")[0])]['value']
if (((' ' in label) == True) or (pproperty != 'P735')):
if (",fr," in sites) == True: sites= "fr," + sites.replace(",fr,", ",", 1)
if (",commons," in sites) == True: sites= "commons," + sites.replace(",commons,", ",", 1)
if (",en," in sites) == True: sites= "en," + sites.replace(",en,", ",", 1)
sites = (sites[:68] + '..') if len(sites) > 68 else sites
text += table_row.format(q,label.encode('utf-8'),cnt,sites)
reportlength +=1
skipped -=1
if reportlength == 1:
maxvalue = cnt
itemlist = q.replace("Q", "", 1)
else:
itemlist = itemlist + q.replace("Q", ",", 1)
if reportlength == maxreportlength:
break
minvalue = cnt
return text, maxvalue, minvalue, reportlength, skipped, itemlist
def wikisite(sit):
if sit in ('commonswiki', 'specieswiki', 'metawiki'):
group = 'wikimedia'
sdomain = sit.split('wiki')[0]
elif sit in ('wikidatawiki', 'mediawikiwiki'):
sdomain = 'www'
group = sit.split('wiki')[0]
else:
sdomain, group = sit.split('wik')
sdomain = sdomain.replace('_', '-')
if sdomain == 'nb':
sdomain = 'no'
group = 'wik'+group
if group == 'wiki':
group = 'wikipedia'
return sdomain, group
def makeNoClaimReport(db, sit, maxrlength, table_row, mincount, numberofclaims, reportmode):
# FIXME: if numberofclaims>0 limit to pages in namespace 0 (article)
if reportmode == "cat":
hiddencat = ""
if sit in ("frwiki"):
#condition to be inserted in query1
hiddencat = ' cl_to NOT IN ( SELECT page_title FROM ' + sit + '_p.page, ' + sit + '_p.page_props WHERE page_namespace = 14 AND pp_page = page_id AND pp_propname = "hiddencat" ) AND '
query1 = 'SELECT c.cl_to As cat, c.cnt As cnt, d.pp_value As QID FROM (SELECT cl_to, COUNT(*) AS cnt FROM ' + sit + '_p.categorylinks as cl, ' + sit + '_p.page_props As wpage_props WHERE ' + hiddencat + ' cl_from = wpage_props.pp_page AND wpage_props.pp_propname = "wikibase_item" AND wpage_props.pp_page = cl.cl_from AND cl_type = "page" AND wpage_props.pp_value IN ( SELECT page_title FROM page, page_props, wb_entity_per_page, wb_items_per_site WHERE ips_site_id = "' + sit + '" AND pp_page = epp_page_id AND pp_propname = "wb-claims" AND pp_value = "'+str(numberofclaims)+'" AND epp_entity_id = ips_item_id AND epp_entity_type = "item" AND page_id = pp_page) GROUP BY cl_to HAVING COUNT(*)>' + str(mincount) + ' ORDER BY COUNT(*) DESC) as c LEFT JOIN (SELECT page_title, pp_value FROM ' + sit + '_p.page, ' + sit + '_p.page_props WHERE page_namespace = 14 AND pp_page = page_id AND pp_propname = "wikibase_item" ) as d ON c.cl_to = d.page_title'
else:
query1 = 'SELECT REPLACE(tl_title, "_", " ") As tmpl, COUNT(*) As cnt, "" as QID FROM ' + sit + '_p.templatelinks, ' + sit + '_p.page_props As wpage_props WHERE tl_from = wpage_props.pp_page AND wpage_props.pp_propname = "wikibase_item" AND wpage_props.pp_page = tl_from AND wpage_props.pp_value IN ( SELECT page_title FROM page, page_props, wb_entity_per_page, wb_items_per_site WHERE ips_site_id = "' + sit + '" AND pp_page = epp_page_id AND pp_propname = "wb-claims" AND pp_sortkey = '+str(numberofclaims)+' AND epp_entity_id = ips_item_id AND epp_entity_type = "item" AND page_id = pp_page) GROUP BY tl_title HAVING COUNT(tl_title)>' + str(mincount) + ' ORDER BY COUNT(tl_title) DESC'
reportlength = 0
cursor = db.cursor()
cursor.execute(query1)
text = ''
for cat, cnt, qid in cursor:
lang, group = wikisite(sit)
tresult = 1
if sit == "enwiki":
ca = ("Wikipedia_", "Articles_", "All_", "Use_", "Orphaned_", "CS1_", "Pages_", "Cleanup_tagged", "EngvarB_from", "Uncategorized_from")
for c in ca:
if cat.startswith(c):
tresult = 0
elif sit == "itwiki":
p = re.match('Senza_fonti_-_.*_\d\d\d\d',cat)
if p:
tresult = 0
elif sit == "plwiki":
p = re.match('Artyku.y_wymagaj.ce_uzupe.nienia_.r.de._od_\d\d\d\d.*',cat)
if p:
tresult = 0
elif sit == "jawiki":
p = re.match(u'出典を必要とする記事.*',cat)
if p:
tresult = 0
elif sit == "ptwiki":
p = re.match('!Artigos_que_carecem_de_fontes_desde.*',cat)
if p:
tresult = 0
p = re.match('!Artigos_sem_interwiki_desde_.*',cat)
if p:
tresult = 0
if tresult == 1:
reportlength +=1
text += table_row.format(cat.replace("_"," "), cnt, lang, group, qid, reportlength)
if reportlength == maxrlength:
break
return text, reportlength
def main():
db = MySQLdb.connect(host="wikidatawiki.labsdb",db="wikidatawiki_p", read_default_file="~/replica.my.cnf")
noclaims = ("cywiki", "iswiki", "ocwiki", "ltwiki", "nowiki", "ptwiki", "frwiki", "svwiki", "rowiki", "lvwiki", "nlwiki", "ruwiki", "ukwiki", "plwiki", "cswiki", "skwiki", "enwiki", "dewiki", "itwiki", "zhwiki", "eswiki", "fiwiki", "huwiki", "trwiki", "specieswiki", "dawiki", "eowiki", "etwiki", "cawiki", "nnwiki", "jawiki")
for sit in noclaims:
table_row = '{{{{TR noclaims site|{0}|{1}|{2}|{3}|{4}|{5}}}}}\n'
maxrlength = 2000
mincount = 50
numberofclaims = 0
reportname = "Wikidata:Database reports/items without claims categories/"
if (sit in ("cywiki", "iswiki", "ocwiki", "ltwiki", "rowiki", "lvwiki", "nlwiki", "cswiki", "skwiki", "specieswiki", "huwiki", "cawiki", "nnwiki", "eowiki")): mincount = 10
if (sit in ("enwiki")): maxrlength = 3500
report, reportlength = makeNoClaimReport(db,sit,maxrlength,table_row,mincount,numberofclaims, "cat")
page = pywikibot.Page(site, reportname + sit)
stat = '{{DR noclaims|site=%s|reportlength=%s|mincount=%s}}\n' %(sit, reportlength, mincount)
header = '{{{{' + reportname +'header|{0}}}}}<onlyinclude>\n'
footer = '</onlyinclude>{{' + reportname + 'footer}} __NOINDEX__'
text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
summary = 'Bot: Updating database report: site %s; reportlength: %s; mincount: %s' %(sit, reportlength, mincount)
page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)
noclaims = ("cywiki", "iswiki", "ocwiki", "ltwiki", "nowiki", "itwiki")
for sit in noclaims:
table_row = '{{{{TR noclaims site|{0}|{1}|{2}|{3}|{4}|{5}}}}}\n'
maxrlength = 2000
mincount = 50
numberofclaims = 1
reportname = "Wikidata:Database reports/items with " + str(numberofclaims) + " statement categories/"
if (sit in ("cywiki", "iswiki", "ocwiki", "ltwiki", "rowiki", "lvwiki", "nlwiki", "cswiki", "skwiki")): mincount = 10
if (sit in ("frwiki")): mincount = 30
if (sit in ("enwiki")): maxrlength = 3500
report, reportlength = makeNoClaimReport(db,sit,maxrlength,table_row,mincount,numberofclaims, "cat")
page = pywikibot.Page(site, reportname + sit)
stat = '{{DR noclaims|site=%s|reportlength=%s|mincount=%s}}\n' %(sit, reportlength, mincount)
header = '{{{{' + reportname + 'header|{0}}}}}<onlyinclude>\n'
footer = '</onlyinclude>{{' + reportname + 'footer}} __NOINDEX__'
text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
summary = 'Bot: Updating database report: site %s; reportlength: %s; mincount: %s' %(sit, reportlength, mincount)
page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)
noclaimstemplate = ("rowiki", "lvwiki", "nlwiki", "ruwiki", "ukwiki", "cswiki", "skwiki", "dewiki", "svwiki", "enwiki", "frwiki", "ptwiki", "zhwiki", "jawiki", "cawiki")
# noclaimstemplate = ("cywiki", "iswiki", "ltwiki", "ptwiki", "rowiki", "lvwiki", "nlwiki", "ruwiki", "ukwiki", "cswiki", "skwiki", "dewiki")
for sit in noclaimstemplate:
table_row = '{{{{TR noclaimstemplate site|{0}|{1}|{2}|{3}}}}}\n'
maxrlength = 2000
mincount = 50
numberofclaims = 0
reportname = "Wikidata:Database reports/templates and items with 0 claims/"
if (sit in ("cywiki", "iswiki", "ocwiki", "ltwiki", "rowiki", "lvwiki", "svwiki", "nlwiki", "jawiki")): mincount = 10
report, reportlength = makeNoClaimReport(db, sit, maxrlength, table_row, mincount, numberofclaims, "template")
page = pywikibot.Page(site, reportname + sit)
stat = '{{DR noclaims|site=%s|reportlength=%s|mincount=%s|numberofclaims=%s}}\n' %(sit, reportlength, mincount, numberofclaims)
header = '{{{{'+reportname+'header|{0}}}}}<onlyinclude>\n'
footer = '</onlyinclude>{{'+reportname+'footer}} __NOINDEX__'
text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
summary = 'Bot: Updating database report: site %s; reportlength: %s; mincount: %s; numberofclaims: %s' %(sit, reportlength, mincount, numberofclaims)
page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)
noclaimstemplate = ("frwiki", "ptwiki", "zhwiki")
for sit in noclaimstemplate:
table_row = '{{{{TR 1 claimstemplate site|{0}|{1}|{2}|{3}}}}}\n'
maxrlength = 2000
mincount = 50
numberofclaims = 1
reportname = "Wikidata:Database reports/templates and items with 1 claim/"
if (sit in ("cywiki", "iswiki", "ocwiki", "ltwiki", "rowiki", "lvwiki", "svwiki", "nlwiki")): mincount = 10
report, reportlength = makeNoClaimReport(db, sit, maxrlength, table_row, mincount, numberofclaims, "template")
page = pywikibot.Page(site, reportname + sit)
stat = '{{DR noclaims|site=%s|reportlength=%s|mincount=%s|numberofclaims=%s}}\n' %(sit, reportlength, mincount, numberofclaims)
header = '{{{{'+reportname+'header|{0}}}}}<onlyinclude>\n'
footer = '</onlyinclude>{{'+reportname+'footer}} __NOINDEX__'
text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
summary = 'Bot: Updating database report: site %s; reportlength: %s; mincount: %s; numberofclaims: %s' %(sit, reportlength, mincount, numberofclaims)
page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)
for pproperty in properties:
report, maxvalue, minvalue, reportlength, skipped, itemlist = makeReport(db,pproperty)
stat = '{{DR missing properties|property=%s|max=%s|min=%s|reportlength=%s|skipped=%s}}\n' %(pproperty, maxvalue, minvalue, reportlength, skipped)
page = pywikibot.Page(site,'Wikidata:Database reports/top missing properties by number of sitelinks/'+pproperty)
header = '{{{{Wikidata:Database reports/top missing properties by number of sitelinks/header|{0}|itemlist='+ itemlist +'}}}}<onlyinclude>\n'
footer = '</onlyinclude>{{Wikidata:Database reports/top missing properties by number of sitelinks/footer}} __NOINDEX__'
text = stat + header.format(time.strftime("%Y-%m-%d %H:%M (%Z)")) + report + footer
summary = 'Bot: Updating database report: property %s; max: %s; min: %s; reportlength: %s; skipped: %s' %(pproperty, maxvalue, minvalue, reportlength, skipped)
page.put(text.decode('UTF-8'),comment=summary,minorEdit=False)
if __name__ == "__main__":
main()