# Import modules
import pywikibot
from pywikibot import pagegenerators
from pywikibot.data import api
import numpy as np
import requests
from importlib import reload
import re
# Module-level handles: connect to Wikidata and get its item repository.
# All lookups and edits below go through `site` / `repo`.
site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()
# DB name of English Wikipedia (used as the sitelink key) and its URL
# language prefix (used when printing article URLs).
enwp_site = 'enwiki'
prefix = 'en'
def search_entities(site, itemtitle):
    """Query the wbsearchentities API for *itemtitle* on *site*.

    Searches English-language item labels/aliases and returns the parsed
    JSON response dict from the API (contains a 'search' result list).
    """
    query = dict(
        action='wbsearchentities',
        format='json',
        language='en',
        type='item',
        search=itemtitle,
    )
    return api.Request(site=site, parameters=query).submit()
def findCorrectQID(wikidataEntries, text):
    """Scan a wbsearchentities response for exact English label matches.

    For every result whose match is an exact English *label* equal to
    *text* (and which is not a disambiguation page), print its QID,
    label and description.

    Fixes over the original:
    - 'description' is absent from many Wikidata search results; use
      .get() instead of indexing to avoid a KeyError.
    - removed the unused local `curr_url`.
    - return the QID of the first match (the original returned None
      implicitly; callers that ignore the return value are unaffected).

    Parameters:
        wikidataEntries: dict - parsed wbsearchentities JSON response.
        text: str - the exact label text that was searched for.

    Returns:
        str or None - QID of the first exact match, or None.
    """
    found = None
    for result in wikidataEntries.get('search', []):
        qid = result['id']
        label = result['label']
        # .get(): 'description' is optional in search results
        desc = result.get('description')
        match = result['match']
        if (match['language'] == 'en'
                and match['type'] == 'label'
                and match['text'] == text
                and desc != 'Wikimedia disambiguation page'):
            # Print QID, label and description for the matched term
            print(qid + "- " + label + "\nDescription: " + str(desc) + "\n")
            if found is None:
                found = qid
    return found
# Terms to look up on Wikidata; for each one, run a live
# wbsearchentities query and print any exact English-label matches.
list_of_text = ["BSD", "free operating system", 'software', "Matthew Dillon", "FreeBSD", "x86_64", "FreeBSD ports"]
# NOTE(review): list_of_text2 is defined but never used below.
list_of_text2 = ["free software", "Microsoft Windows"]
for text in list_of_text:
    wikidataEntries = search_entities(repo, text)
    findCorrectQID(wikidataEntries, text)
""""For Unconnected pages"""
#Following https://bitbucket.org/mikepeel/wikicode/src/master/enwp_find_wikidata.py
def unconnected_pages(query):
    """Look for the enwiki page titled *query* among Unconnected Pages and
    interactively attach it to a matching Wikidata item.

    Walks enwiki's 'UnconnectedPages' special-page list; when a page whose
    title equals *query* is found, searches Wikidata for candidate items by
    the title (parenthetical disambiguators stripped) and, for each
    candidate that has no existing enwiki sitelink, prompts the operator
    ("Save? ") before adding the sitelink.

    Parameters:
        query: str - exact enwiki page title to process.

    Returns:
        None - all results are printed / edited as a side effect.
    """
    enwp = pywikibot.Site('en', 'wikipedia')
    # NOTE(review): enwd is assigned but never used in this function.
    enwd = pywikibot.Site('wikidata', 'wikidata')
    targetcats = ['Category:Articles_without_Wikidata_item']
    for targetcat in targetcats:
        # NOTE(review): cat is built but unused — the loop actually pulls
        # pages from the UnconnectedPages special page, not the category.
        cat = pywikibot.Category(enwp, targetcat)
        # pages = pagegenerators.CategorizedPageGenerator(cat, recurse=False)
        pages = enwp.querypage('UnconnectedPages')
        for page in pages:
            if query == page.title():
                print("\n" + "http://en.wikipedia.org/wiki/"+page.title().replace(' ','_'))
                # Skip AfD pages and redirects — neither should get a sitelink.
                if 'Articles for deletion' in page.title():
                    continue
                if page.isRedirectPage():
                    continue
                try:
                    # If the page already resolves to a Wikidata item,
                    # there is nothing to do.
                    wd_item = pywikibot.ItemPage.fromPage(page)
                    item_dict = wd_item.get()
                    qid = wd_item.title()
                    print("Has a sitelink already - " + qid)
                    continue
                except:
                    # If that didn't work, go no further
                    print(page.title() + ' - no page found')
                    wd_item = 0
                    item_dict = 0
                    qid = 0
                    sitelink_check = 0
                    # continue
                # If we're here, then we don't have one, see if we can add it through the commons category
                # Strip a trailing "(disambiguator)" from the title before searching.
                searchtag = page.title()
                try:
                    searchtag = searchtag.split('(')[0].strip()
                except:
                    null = 0
                wikidataEntries = search_entities(repo, searchtag)
                # print(wikidataEntries)
                # Sitelink payload that would be written to the chosen item.
                data = {'sitelinks': [{'site': enwp_site, 'title': page.title()}]}
                # print(wikidataEntries['searchinfo'])
                done = 0
                if wikidataEntries['search'] != []:
                    results = wikidataEntries['search']
                    # prettyPrint(results)
                    numresults = len(results)
                    # NOTE(review): this only warns; candidates are still
                    # offered interactively below.
                    if numresults > 5:
                        print('More than 5 candidates, bot would skip')
                    for i in range(0,numresults):
                        # done != 0 once a sitelink has been saved; skip the rest.
                        if done != 0:
                            continue
                        targetpage = pywikibot.ItemPage(site, results[i]['id'])
                        try:
                            item_dict = targetpage.get()
                        except:
                            continue
                        # print(item_dict)
                        # sitelink stays '' when the item has no enwiki sitelink.
                        sitelink = ''
                        try:
                            # NOTE(review): get_sitelink_title is not defined in
                            # this file — if missing at runtime, the bare except
                            # leaves sitelink == '' and every item looks unlinked.
                            sitelink = get_sitelink_title(item_dict['sitelinks'][enwp_site])
                        except:
                            null = 0
                        if sitelink == '':
                            # Show the candidate item (QID, label, description)
                            # and the article URL, then ask before saving.
                            print('http://www.wikidata.org/wiki/'+results[i]['id'])
                            try:
                                print(item_dict['labels']['en'])
                            except:
                                print('')
                            try:
                                print(item_dict['descriptions']['en'])
                            except:
                                print('')
                            print('http://'+prefix+'.wikipedia.org/wiki/' + page.title().replace(' ','_'))
                            text = input("Save? ")
                            # Anything except 'n' confirms the save.
                            if text != 'n':
                                targetpage.editEntity(data, summary=u'Add enwp sitelink')
                                done = 1
# Interactive runs (live network + operator prompt):
unconnected_pages('Anna Palmer') #Search for "Anna Palmer" in Unconnected Pages - will find sitelink for this
unconnected_pages("Central Council for Research in Homoeopathy (CCRH)") #Search for "Central Council for Research in Homoeopathy (CCRH)" in Unconnected Pages - will not find in page, have an option to save if link is found