#-*- coding: utf-8 -*-
"""
Gets descriptions from Persondata templates and adds them to the
linked Wikidata item.
The list of pages to work on can be created with any page generator.
Usage:
python pwb.py descriptions -lang:en -family:wikipedia -cat:'Russian memoirists'
"""
import pywikibot
from pywikibot import pagegenerators as pg
import re
class descriptionbot(object):
    """Copy Persondata short descriptions to the linked Wikidata items.

    For every page yielded by the generator, the bot looks up the connected
    Wikidata item, visits each linked ``*wiki`` article, reads the short
    description field of the local Persondata template and adds it as the
    item description for that language when none is set yet.
    """

    def __init__(self, generator):
        """Store the page generator and set up the lookup tables.

        generator -- iterable of pywikibot pages to work on.
        """
        # Name of the Persondata field that holds the short description,
        # keyed by the database name of the wiki.
        self.descriptiondictio = {"cawiki": "DESCRIPCIO CURTA",
                                  "elwiki": u'ΜΙΚΡΗ ΠΕΡΙΓΡΑΦΗ',
                                  "enwiki": "SHORT DESCRIPTION",
                                  "dawiki": "KORT BESKRIVELSE",
                                  "dewiki": "KURZBESCHREIBUNG",
                                  "ilowiki": "ABABA A PANGILADAWAN",
                                  "kowiki": u'짧은 설명',
                                  "skwiki": "Popis",
                                  "slwiki": "SHORT DESCRIPTION",
                                  "mnwiki": "SHORT DESCRIPTION",
                                  "nlwiki": "KORTE OMSCHRIJVING",
                                  "viwiki": u'TÓM TẮT'}
        self.repo = pywikibot.Site().data_repository()
        # Wikidata item whose sitelinks give the local title of the
        # Persondata template on each wiki.
        self.persondataitem = pywikibot.ItemPage(self.repo, u"Q5153934")
        self.generator = generator
        # persondata on the following wikis is untranslated
        # and will be skipped
        self.ignorelist = ["commonswiki", "alswiki", "aswiki", "bnwiki",
                           "jawiki", "mlwiki", "pamwiki", "zhwiki"]

    def extractpersondata(self, page, pdatatitle, langver):
        """Return the cleaned Persondata description found on a page.

        page -- the article whose wikitext is scanned.
        pdatatitle -- local title of the Persondata template (no namespace).
        langver -- database name of the wiki, e.g. "enwiki".

        Returns the description with wiki markup removed, or None when the
        page does not use the template.

        Raises KeyError when no description field name is registered for
        langver, or when the template on the page lacks that field.
        """
        templates = pywikibot.extract_templates_and_params(page.get())
        for template, fielddict in templates:
            if template != pdatatitle:
                continue
            description = self.removewikimarkup(
                fielddict[self.descriptiondictio[langver]])
            pywikibot.log("Persondata description: '%s'" % description)
            return description
        # Reached only when no template on the page matched.
        pywikibot.log("Persondata not found in %s" % page)
        return None

    def removewikimarkup(self, description):
        """Strip link brackets and quote-based emphasis from a description.

        Square brackets are dropped, a piped link ``[[target|label]]`` is
        reduced to its label, and runs of two or more apostrophes (italic,
        bold and bold-italic markup) are removed.
        """
        cleaned = []
        # Every maximal run of non-bracket characters is either plain text
        # or the inside of a link.
        for fragment in re.findall(r'[^\[\]]+', description, re.UNICODE):
            # Keep only what follows the last pipe (the visible label).
            label = fragment.rpartition("|")[2]
            # "'{2,}" also covers ''' and ''''' so that bold markup does
            # not leave single apostrophes behind.
            cleaned.append(re.sub(r"'{2,}", "", label))
        return "".join(cleaned)

    def _gatherdescriptions(self, itemtext):
        """Collect missing descriptions for one item, keyed by language.

        itemtext -- the content dictionary returned by ``item.get()``.
        """
        gathered = {}
        # Local titles of the Persondata template, keyed by wiki name.
        persondatalinks = self.persondataitem.get()['sitelinks']
        for langver, title in itemtext['sitelinks'].items():
            if not langver.endswith("wiki") or langver in self.ignorelist:
                continue
            # get the article in a given language
            basesite = self.repo.fromDBName(langver)
            currentpage = pywikibot.Page(basesite, title)
            pywikibot.log("Processing %s" % currentpage)
            # get the local equivalent of persondata, if any exists
            try:
                pdatatitle = persondatalinks[langver]
            except KeyError:
                pywikibot.log("Persondata doesn't exist on %s, skipping"
                              % langver)
                continue
            # Clean up template title
            pdatatitle = pywikibot.Page(
                basesite, pdatatitle, ns=10).title(withNamespace=False)
            # get the persondata description
            try:
                pddescription = self.extractpersondata(
                    currentpage, pdatatitle, langver)
            except KeyError:
                pywikibot.output(
                    "The name of the description field was not found"
                    " for %s, please add it to the script" % langver)
                # Nothing usable for this wiki; never reuse the value
                # obtained for a previously processed wiki.
                continue
            # get the description in Wikidata
            lang = currentpage.site.language()
            if lang in itemtext['descriptions']:
                pywikibot.log("Wikidata description: '%s'"
                              % itemtext['descriptions'][lang])
                # An existing description is left untouched.
                continue
            pywikibot.log("The description is not set on Wikidata")
            if pddescription:
                gathered[lang] = pddescription
        return gathered

    def run(self):
        """Process every page of the generator and update its item."""
        for page in self.generator:
            if not page.exists():
                continue
            pywikibot.output("Processing %s" % page.title())
            item = pywikibot.ItemPage.fromPage(page)
            if not item.exists():
                continue
            newdescription = self._gatherdescriptions(item.get())
            if not newdescription:
                pywikibot.output("Nothing to add")
                continue
            # add all the gathered descriptions
            pywikibot.output("Adding description(s): %s" % newdescription)
            added = ', '.join("%s [%s]" % (val, key)
                              for key, val in newdescription.items())
            item.editDescriptions(
                newdescription,
                summary="Adding Persondata descriptions: %s" % added)
# Let the generator factory consume the command-line arguments it knows.
gen = pg.GeneratorFactory()
for argument in pywikibot.handleArgs():
    gen.handleArg(argument)

generator = gen.getCombinedGenerator()
if not generator:
    # No page generator was requested on the command line: fall back to
    # the namespace-0 pages that embed the "Persondata" template of the
    # default site.
    site = pywikibot.Site()
    persondatalink = pywikibot.Link(
        "Persondata", defaultNamespace=10, source=site
    )
    transclusionPage = pywikibot.Page(persondatalink)
    generator = site.page_embeddedin(
        transclusionPage, filterRedirects=None,
        namespaces=0, step=None, total=None, content=False
    )

# The bot instance is kept in a module-level name.
bot = descriptionbot(generator)
bot.run()