# User:Underlying lk/descriptions.py
# -*- coding: utf-8 -*-
"""
Gets descriptions from Persondata templates and adds them to the
linked Wikidata item.

The list of pages to work on can be created with any page generator.

Usage:
python pwb.py descriptions -lang:en -family:wikipedia -cat:'Russian memoirists'

"""
import pywikibot
from pywikibot import pagegenerators as pg
import re


class descriptionbot(object):
    """Copy Persondata short descriptions to the linked Wikidata items.

    For every page yielded by the generator the bot loads the connected
    Wikidata item, visits each ``*wiki`` sitelink that is not on the ignore
    list, reads the description field of the local Persondata template and
    collects it for every language that has no description on the item yet.
    All collected descriptions are saved with a single edit per item.
    """

    def __init__(self, generator):
        """Store the page generator and prepare the Wikidata lookups.

        @param generator: iterable of pages to work on
        """
        # Name of the description parameter of the local Persondata
        # template, keyed by the database name of the wiki.
        self.descriptiondictio = {"cawiki": "DESCRIPCIO CURTA",
                                  "elwiki": u'ΜΙΚΡΗ ΠΕΡΙΓΡΑΦΗ',
                                  "enwiki": "SHORT DESCRIPTION",
                                  "dawiki": "KORT BESKRIVELSE",
                                  "dewiki": "KURZBESCHREIBUNG",
                                  "ilowiki": "ABABA A PANGILADAWAN",
                                  "kowiki": u'짧은 설명',
                                  "skwiki": "Popis",
                                  "slwiki": "SHORT DESCRIPTION",
                                  "mnwiki": "SHORT DESCRIPTION",
                                  "nlwiki": "KORTE OMSCHRIJVING",
                                  "viwiki": u'TÓM TẮT'}
        self.repo = pywikibot.Site().data_repository()
        # Item whose sitelinks give the local title of the Persondata
        # template on each wiki.
        self.persondataitem = pywikibot.ItemPage(self.repo, u"Q5153934")
        self.generator = generator
        # persondata on the following wikis is untranslated
        # and will be skipped
        self.ignorelist = ["commonswiki", "alswiki", "aswiki", "bnwiki",
                           "jawiki", "mlwiki", "pamwiki", "zhwiki"]

    def extractpersondata(self, page, pdatatitle, langver):
        """Return the cleaned Persondata description found on a page.

        @param page: page whose wikitext is searched
        @param pdatatitle: title of the Persondata template on that wiki,
            without the namespace prefix
        @param langver: database name of the wiki (e.g. "enwiki"); selects
            the name of the description field
        @return: the description with wiki markup removed, or None when no
            template named pdatatitle is found in the page text
        @raise KeyError: if no field name is registered for langver, or the
            template found on the page has no such field
        """
        pagetext = page.get()
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            if template != pdatatitle:
                continue
            description = fielddict[self.descriptiondictio[langver]]
            description = self.removewikimarkup(description)
            pywikibot.log("Persondata description: '%s'" % description)
            return description
        pywikibot.log("Persondata not found in %s" % page)
        return None

    def removewikimarkup(self, description):
        """Strip link brackets, link targets and '' quotes from a string.

        @param description: raw template value, possibly with wiki markup
        @return: the text with square brackets removed, every bracket-free
            chunk reduced to what follows its last pipe, and all
            occurrences of two consecutive apostrophes deleted
        """
        pieces = []
        # Cutting at square brackets leaves chunks that are either plain
        # text or the inside of a [[link]].
        for chunk in re.findall(r'([^\[\]]+)', description, re.UNICODE):
            # For "target|label" keep the label; a chunk without a pipe is
            # kept whole.
            label = chunk.rpartition("|")[2]
            pieces.append(label.replace("''", ""))
        return "".join(pieces)

    def run(self):
        """Process every page of the generator and save new descriptions."""
        for page in self.generator:
            if not page.exists():
                continue
            pywikibot.output("Processing %s" % page.title())
            item = pywikibot.ItemPage.fromPage(page)
            if not item.exists():
                continue
            itemtext = item.get()
            newdescription = {}
            # get the article in a given language
            for langver, linktitle in itemtext['sitelinks'].items():
                if not langver.endswith("wiki") or langver in self.ignorelist:
                    continue
                basesite = self.repo.fromDBName(langver)
                currentpage = pywikibot.Page(basesite, linktitle)
                pywikibot.log("Processing %s" % currentpage)
                # get the local equivalent of persondata,
                # if any exists
                try:
                    pdatatitle = self.persondataitem.get()['sitelinks'][langver]
                    # Clean up template title
                    pdatatitle = pywikibot.Page(basesite, pdatatitle,
                                                ns=10).title(withNamespace=False)
                except KeyError:
                    pywikibot.log("Persondata doesn't exist on %s, skipping" % langver)
                    continue
                # get the persondata description
                try:
                    pddescription = self.extractpersondata(
                        currentpage, pdatatitle, langver)
                except KeyError:
                    pywikibot.output("The name of the description field was not found"
                                     " for %s, please add it to the script" % langver)
                    # Reset explicitly so a value left over from a previous
                    # sitelink is never stored under this language.
                    pddescription = None
                # get the description in Wikidata
                lang = currentpage.site.language()
                if lang in itemtext['descriptions']:
                    pywikibot.log("Wikidata description: '%s'" % itemtext['descriptions'][lang])
                else:
                    pywikibot.log("The description is not set on Wikidata")
                    if pddescription:
                        newdescription[lang] = pddescription
            if newdescription:
                # add all the gathered descriptions
                pywikibot.output("Adding description(s): %s" % newdescription)
                added = ', '.join("%s [%s]" % (val, key)
                                  for (key, val) in newdescription.items())
                item.editDescriptions(
                    newdescription,
                    summary="Adding Persondata descriptions: %s" % added)
            else:
                pywikibot.output("Nothing to add")

# Let the generator factory consume the command line arguments; anything it
# does not recognise is simply ignored.
gen = pg.GeneratorFactory()
for arg in pywikibot.handleArgs():
    gen.handleArg(arg)
generator = gen.getCombinedGenerator()
if not generator:
    # No generator was requested on the command line: fall back to the
    # namespace-0 pages that embed the "Persondata" page (namespace 10).
    persondatalink = pywikibot.Link("Persondata", defaultNamespace=10,
                                    source=pywikibot.Site())
    transclusionPage = pywikibot.Page(persondatalink)
    generator = pywikibot.Site().page_embeddedin(
        transclusionPage,
        filterRedirects=None,
        namespaces=0,
        step=None,
        total=None,
        content=False
    )

# The name `bot` stays a module-level global.
bot = descriptionbot(generator)
bot.run()