# Source wiki page: User:Underlying lk/harvest template old.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

"""
Usage:

python harvest_template.py -lang:nl -template:"Taxobox straalvinnige" orde P70 familie P71 geslacht P74

This will work on all pages that transclude the template in the article
namespace

You can use any typical pagegenerator to provide it with a list of pages:

python harvest_template.py -lang:nl -cat:Sisoridae -template:"Taxobox straalvinnige" -namespace:0 orde P70 familie P71 geslacht P74

&params;
"""
#
# (C) Multichill, Amir, 2013
# (C) Pywikibot team, 2013
#
# Distributed under the terms of MIT License.
#
__version__ = '$Id: 2507544f311b7164e04c7c83198a891f33e9f8ee $'
#

import re
import json
import pywikibot
from pywikibot import pagegenerators as pg
from datetime import datetime
from datetime import timedelta

# Substitute the &params; placeholder in the module docstring with the
# standard pagegenerators parameter help text when pywikibot builds -help.
docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}


class HarvestRobot:
    """
    A bot to add Wikidata claims harvested from template parameters
    on wiki pages.
    """
    def __init__(self, generator, templateTitle, fields, overwrite=False):
        """
        Arguments:
            * generator     - A generator that yields Page objects.
            * templateTitle - The template to work on
            * fields        - A dictionary mapping template field names to
                              Wikidata property ids (e.g. {'orde': 'P70'})
            * overwrite     - if existing claims should be overwritten

        """
        self.generator = generator
        # Page titles use spaces; normalize underscores so both forms match.
        self.templateTitle = templateTitle.replace(u'_', u' ')
        # TODO: Make it a list which also includes the redirects to the template
        self.fields = fields
        self.overwrite = overwrite
        self.repo = pywikibot.Site().data_repository()
        self.cacheSources()

    def getSource(self, site):
        """
        Get the "imported from" (P143) source claim for the specified site,
        if possible. Returns None when the site is not in the cached list.
        """
        if site.family.name in self.source_values and site.code in self.source_values[site.family.name]:
            source = pywikibot.Claim(self.repo, 'P143')
            source.setTarget(self.source_values.get(site.family.name).get(site.code))
            return source

    def cacheSources(self):
        """
        Fetches the sources from the onwiki list
        and stores it internally as ItemPage objects keyed by
        family name and language code.
        """
        page = pywikibot.Page(self.repo, u'List of wikis/python', ns=4)
        self.source_values = json.loads(page.get())
        # .items() instead of the Python-2-only iteritems(); only inner
        # values are replaced, so mutating during iteration is safe.
        for family_code, family in self.source_values.items():
            for source_lang in family:
                self.source_values[family_code][source_lang] = pywikibot.ItemPage(self.repo,
                                                                                  family[source_lang])

    def run(self):
        """
        Starts the robot.
        """
        self.templateTitles = self.getTemplateSynonyms(self.templateTitle)
        for i, page in enumerate(self.generator):
            try:
                self.procesPage(i, page)
            except Exception:
                # Log the full traceback but keep going with the next page.
                pywikibot.exception(tb=True)

    def newItem(self, page, item):
        """
        Create item where none exists (from newitem.py by Multichill)

        Skips redirects, user pages, very recently created or edited pages,
        and pages that already carry interwiki links (those may already be
        connected to an existing item).
        """
        self.pageAge = 21  # minimum page age in days
        self.pageAgeBefore = self.repo.getcurrenttime() - timedelta(days=self.pageAge)
        self.lastEdit = 7  # minimum days since the last edit
        self.lastEditBefore = self.repo.getcurrenttime() - timedelta(days=self.lastEdit)

        if page.isRedirectPage():
            pywikibot.output('%s is a redirect page. Skipping.' % page)
        elif page.namespace() == 2:
            pywikibot.output('%s is a user page. Skipping.' % page)
        elif page.editTime() > self.lastEditBefore:
            pywikibot.output('Last edit on %s was on %s. Too recent. Skipping.' % (page, page.editTime().isoformat()))
        else:
            # Oldest revision tells us when the page was created.
            (revId, revTimestamp, revUser, revComment) = page.getVersionHistory(reverseOrder=True, total=1)[0]
            if revTimestamp > self.pageAgeBefore:
                pywikibot.output('Page creation of %s on %s is too recent. Skipping.' % (page, page.editTime().isoformat()))
            elif page.langlinks():
                # FIXME: Implement this
                pywikibot.output('Found language links (interwiki links). Haven\'t implemented that yet so skipping.')
            else:
                # FIXME: i18n
                summary = u'Bot: New item with sitelink from %s' % (page.title(asLink=True, insite=self.repo), )

                data = {'sitelinks':
                        {item.getdbName(page.site):
                         {'site': item.getdbName(page.site),
                          'title': page.title()}
                         },
                        'labels':
                        {page.site.lang:
                         {'language': page.site.lang,
                          'value': page.title()}
                         }
                        }
                pywikibot.output(summary)
                item.editEntity(data, summary=summary)

    def getTemplateSynonyms(self, title):
        """
        Fetches redirects of the title, so we can check against them
        """
        pywikibot.output('Finding redirects...')  # Put some output here since it can take a while
        temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
        if temp.isRedirectPage():
            temp = temp.getRedirectTarget()
        titles = [page.title(withNamespace=False)
                  for page
                  in temp.getReferences(redirectsOnly=True, namespaces=[10], follow_redirects=False)]
        titles.append(temp.title(withNamespace=False))
        return titles

    def extractUrl(self, page, value, claim):
        """
        Extract url datatype from field.

        Raises AttributeError (deliberately uncaught here, handled by the
        caller) when none of the patterns matches.
        """
        match = re.search("(http[^|\s\]\}]+)", value)  # try format {{URL|http://www.example.com}} or [http://www.example.com Example]
        if match is None:
            match = re.search("{{\w*\|(www\S+)}}", value)  # try format {{URL|www.example.com}}
            if match is None:
                # may raise AttributeError when match is None -- by design
                match = re.search("{{\w*\|(\S+)}}", value)  # try format {{URL|example.com}}
                extractedUrl = "http://www." + match.group(1)
            else:
                extractedUrl = "http://" + match.group(1)
        else:
            extractedUrl = match.group(1)
        claim.setTarget(extractedUrl)
        pywikibot.output('Extracting %s --> %s' % (value, extractedUrl))

    def monthToNumber(self, page, month):
        """
        Returns an integer from the month name, or None when the name
        cannot be resolved for the language of the given page.
        """
        # return if the month is already int
        try:
            return int(month)
        except ValueError:
            pass
        # try formats from date.py
        month_dictio = {}
        languageEdition = page.site.language()
        getMonthInt = pywikibot.date.getAutoFormat
        parsedMonth = getMonthInt(languageEdition, month)[1]
        if parsedMonth:
            month = parsedMonth
        else:
            # Fallback tables for (mostly genitive) month forms that
            # date.py does not recognize.
            if languageEdition == "cs":   # Česky (Czech)
                month_dictio = {"ledna": "1", "února": "2", "března": "3", "dubna": "4",
                                "května": "5", "června": "6", "července": "7", "srpna": "8",
                                "září": "9", "října": "10", "listopadu": "11", "prosince": "12"}
            elif languageEdition == "fr":   # Français
                month_dictio = {"mars": "3"}
            elif languageEdition == "ru":   # Russian
                month_dictio = {"января": "1", "февраля": "2", "марта": "3", "апреля": "4",
                                "мая": "5", "июня": "6", "июля": "7", "августа": "8",
                                "сентября": "9", "октября": "10", "ноября": "11", "декабря": "12"}
            if month in month_dictio:
                month = month_dictio[month]
        try:
            month = int(month)
        except ValueError:
            pywikibot.output(month + " is not a valid month")
            month = None
        return month

    def extractTime(self, page, value, claim):
        """
        Extract time from field.

        Tries a series of date patterns; raises AttributeError (handled by
        the caller) when even the year-only last resort fails.
        """
        extractedMonth = None
        extractedDay = None
        # NOTE(review): dict iteration order decides which pattern wins when
        # several match -- this was arbitrary on Python < 3.7 already.
        regexDict = {"\{\{(?:[dD]ate|[fF]echa|[dD]ni|[fF]alec).*?\|(\d{1,2})\|(\w*)\|(\d{1,4})": "ddMMyyyy",  # templates, format 1 January 2000
                     "(\d{1,2})\.?\s(\w*)\]{0,2},?\s\[{0,2}(\d{1,4})": "ddMMyyyy",                  # format 1 January 2000
                     "(\d{1,2})[\.\/](\w*)[\.\/](\d{3,4})": "ddMMyyyy",                             # format 01.01.2000
                     "^(?:(?!access).)*?(\d{3,4})[\.\/-](\w*)[\.\/-](\d{1,2})": "yyyymmdd",          # format 2000-01-01
                     "(\d{1,2})\sde\s(\w*)\]{0,2}\sde\s\[{0,2}(\d{1,4})": "ddMMyyyy",               # format [[1 de enero]] de [[2000]]
                     "(\w*)\.?\s(\d{1,2}),\s(\d{1,4})": "MMddyyyy",                                 # format January 1, 2000
                     "\{\{\w(?:irth|eath|tart|ilm).*?(\d{1,4})[\|\}](?:\w*=)?(?:\d{3,4}|(\d{0,2}))[\|\}](?:\w*=)?(\d{0,2})": "yyyymmdd",    # English templates, format 2000 01 01
                     "([\d,]+)(?:\s*| )(?:BC|bc|av. J)": "yyyy BC"}                            # years BC, format 1000 BCE
        for regex, timeFormat in regexDict.items():
            match = re.search(regex, value, re.UNICODE)
            if match is not None:
                if timeFormat == "ddMMyyyy":  # day, month name, year
                    extractedMonth = self.monthToNumber(page, match.group(2))
                    if extractedMonth is None:
                        # unresolvable month name: reset and try next pattern
                        match = None
                        continue
                    else:
                        extractedYear = int(match.group(3))
                        extractedDay = int(match.group(1))
                        break
                elif timeFormat == "MMddyyyy":  # month name, day, year
                    extractedMonth = self.monthToNumber(page, match.group(1))
                    if extractedMonth is None:
                        match = None
                        continue
                    else:
                        extractedYear = int(match.group(3))
                        extractedDay = int(match.group(2))
                        break
                elif timeFormat == "yyyymmdd":  # year, month, day
                    if match.group(2):
                        extractedMonth = int(match.group(2))
                    if match.group(3):
                        extractedDay = int(match.group(3))
                    extractedYear = int(match.group(1))
                    break
                elif timeFormat == "yyyy BC":  # year BCE
                    extractedYear = "-" + match.group(1).replace(",", "")  # remove commas, if any
                    break
        if match is None:
            # last resort: bare (possibly negative) year, but not ordinals
            # like "5th"; .group() raises AttributeError when this fails too
            match = re.search("^\[{0,2}(-?\d{1,4})(?!\d*(st|nd|rd|th))", value)
            extractedYear = match.group(1)
        timeclaim = pywikibot.WbTime(year=extractedYear, month=extractedMonth, day=extractedDay)
        claim.setTarget(timeclaim)
        pywikibot.output('Extracting %s --> %s-%s-%s' % (value, extractedYear, extractedMonth, extractedDay))

    def extractQuantity(self, item, claimproperty, value):
        """
        some basic support for quantities
        data is added directly through the api
        overwrite will not work for them

        Returns the newly created claim object, or None when no quantity
        could be extracted from the field value.
        """
        item = pywikibot.ItemPage(self.repo, item.title())
        regex = r'(?:^|[\s\-–~>:])((?:(?<!\d)-?)\d[\d,\.\'\s]*)(\smillion)?(?:[\s<+/\}]|$)'
        match = re.search(regex, value)
        if match is None:
            pywikibot.output(value)
            pywikibot.output("No valid quantity found")
            return
        # strip thousands separators (commas, apostrophes, spaces)
        number = match.group(1).replace(",", "")
        number = number.replace("'", "")
        number = number.replace(" ", "")
        if match.group(2):
            number = str(int((float(number) * 1000000)))
        if not number.startswith("-"):
            # the API wants an explicit sign on positive amounts
            number = "+" + number
        pywikibot.output('Extracting %s --> %s' % (value, number))
        quantity = {"amount": number,
                    "unit": "1",
                    "upperBound": number,
                    "lowerBound": number
                    }
        params = {'action': 'wbcreateclaim',
                  'entity': item.title(),
                  'property': claimproperty,
                  'snaktype': 'value',
                  'value': json.dumps(quantity),
                  'bot': 1,
                  'token': self.repo.token(item, 'edit')
                  }
        req = pywikibot.data.api.Request(site=self.repo, **params)
        pywikibot.output('Adding %s --> %s' % (claimproperty, quantity))
        data = req.submit()
        # re-fetch and return the claim we just created (the last one)
        item.get()
        lastclaim = len(item.claims[claimproperty]) - 1
        return item.claims[claimproperty][lastclaim]

    def procesPage(self, index, page):
        """
        Proces a single page: find the template, extract the configured
        fields and add the corresponding claims to the item.
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output(u'Processing No. %s: %s' % (index, page))
        if not item.exists():
            # create the page
            self.newItem(page, item)
            item = pywikibot.ItemPage.fromPage(page)
            if not item.exists():
                # The item was not created
                return
        pagetext = page.get()
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Clean up template
            template = pywikibot.Page(page.site, template,
                                      ns=10).title(withNamespace=False)
            # We found the template we were looking for
            if template in self.templateTitles:
                for field, value in fielddict.items():
                    field = field.strip()
                    if field not in self.fields:
                        continue
                    # This field contains something useful for us
                    value = value.strip()
                    claim = pywikibot.Claim(self.repo, self.fields[field])
                    # Check if the property isn't already set
                    if self.overwrite is False:
                        if claim.getID() in item.get().get('claims'):
                            pywikibot.output(
                                u'%s already exists (-overwrite to change it)'
                                % claim.getID())
                            continue
                            # TODO FIXME: This is a very crude way of dupe
                            # checking
                    # Crude text search of the recent history to avoid
                    # re-adding a claim that a human deliberately removed.
                    if str(item.getVersionHistory(total=50)).find("wbremoveclaims-remove:1| */ [[Property:" + claim.getID()) != -1:
                        pywikibot.output('%s cannot be added as it was recently removed from the item' % (claim.getID(),))
                    else:
                        if claim.getType() == 'wikibase-item':
                            # Try to extract a valid page
                            match = re.search(pywikibot.link_regex, value)
                            if match is None:
                                pywikibot.output('No valid item found for %s' % (claim.getID(),))
                                continue
                            else:
                                try:
                                    link = pywikibot.Link(match.group(1))
                                    linkedPage = pywikibot.Page(link)
                                    if linkedPage.isRedirectPage():
                                        linkedPage = linkedPage.getRedirectTarget()
                                    linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                                    claim.setTarget(linkedItem)
                                    if not linkedItem.title():
                                        # accessing title() forces a fetch and
                                        # raises NoPage for missing items
                                        pywikibot.output(" ")
                                    if linkedPage.isDisambig():  # avoid adding disambiguation pages
                                        pywikibot.output('%s is a disambiguation page. Skipping.' % (linkedPage,))
                                        continue
                                except pywikibot.exceptions.NoPage:
                                    pywikibot.output('%s doesn\'t exist so I can\'t link to it' % (linkedPage,))
                                    continue
                        elif claim.getType() == 'string':
                            if value == "":
                                pywikibot.output('No valid string found for %s' % (claim.getID(),))
                                continue
                            else:
                                match = re.search("^(?!(?:\{|\[http))\W*([^<>]+)(?<![<>])", value, re.UNICODE)  # avoid tags, links and templates
                                if match:
                                    value = match.group(1)
                                    claim.setTarget(value.strip())
                                else:
                                    pywikibot.output('%s includes forbidden characters' % (value,))
                                    continue
                        elif claim.getType() == 'url':
                            try:
                                self.extractUrl(page, value, claim)
                            except AttributeError:
                                pywikibot.output('No valid URL found for %s' % (claim.getID(),))
                                continue
                        elif claim.getType() == 'time':
                            try:
                                self.extractTime(page, value, claim)
                            except AttributeError:
                                pywikibot.output('No valid time for %s' % (claim.getID(),))
                                continue
                        elif claim.getType() == 'commonsMedia':
                            mediasite = pywikibot.Site("commons", "commons")
                            if value == "":
                                pywikibot.output('No valid media file found for %s' % (claim.getID(),))
                                continue
                            else:
                                match = re.search("\w{4,5}:([^|\]]*)", value)  # extract filename
                                if match is not None:
                                    value = "File:" + match.group(1)
                                elif not value.startswith("File:"):
                                    value = "File:" + value
                                # check if the image exists on Commons
                                image = pywikibot.ImagePage(mediasite, value)
                                if image.exists() is False:
                                    pywikibot.output('%s does not exist on Commons' % (value,))
                                    continue
                                else:
                                    claim.setTarget(image)
                        elif claim.getType() == 'quantity':
                            # temporary solution until quantities are properly supported
                            claim = self.extractQuantity(item, claim.getID(), value)
                            if not claim:
                                continue
                            source = self.getSource(page.site)
                            if source:
                                claim.addSource(source, bot=True)
                            continue
                        else:
                            pywikibot.output("%s is not a supported datatype." % claim.getType())
                            continue

                        if claim.getID() in item.get().get('claims'):
                            # overwrite
                            item.get()
                            claimToChange = item.claims[claim.getID()][0]
                            valueToChange = claimToChange.getTarget()
                            valueNew = claim.getTarget()
                            # if the new and old claims are the same
                            if valueToChange != valueNew:
                                pywikibot.output('Changing %s --> %s' % (valueToChange, valueNew))
                                claimToChange.changeTarget(valueNew)
                            else:
                                pywikibot.output('Old value %s same as new value %s' % (valueToChange, valueNew))
                        else:
                            pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget()))
                            item.addClaim(claim)
                            # A generator might yield pages from multiple sites
                            source = self.getSource(page.site)
                            if source:
                                claim.addSource(source, bot=True)

def main():
    """
    Process command line arguments and invoke the HarvestRobot.

    Leftover (non-option) arguments are interpreted pairwise as a template
    field name followed by the Wikidata property id it maps to.

    Raises ValueError when no template was given or the field/property
    arguments don't pair up.
    """
    gen = pg.GeneratorFactory()
    commandline_arguments = list()
    templateTitle = u''
    overwrite = False
    for arg in pywikibot.handleArgs():
        if arg.startswith('-template'):
            if len(arg) == 9:
                # bare "-template": prompt interactively for the title
                templateTitle = pywikibot.input(
                    u'Please enter the template to work on:')
            else:
                templateTitle = arg[10:]
        elif arg.startswith('-overwrite'):
            overwrite = True
        elif gen.handleArg(arg):
            continue
        else:
            commandline_arguments.append(arg)

    if len(commandline_arguments) % 2 or not templateTitle:
        raise ValueError(
            u'A -template argument and an even number of field/property '
            u'arguments are required, e.g. '
            u'-template:"Infobox" fieldname P123')
    # Pair up the arguments: field name followed by its property id.
    fields = dict()
    for i in range(0, len(commandline_arguments), 2):
        fields[commandline_arguments[i]] = commandline_arguments[i + 1]

    generator = gen.getCombinedGenerator()
    if not generator:
        # No explicit generator given: default to all article-namespace
        # pages that transclude the template.
        transclusionPage = pywikibot.Page(
            pywikibot.Link(
                templateTitle, defaultNamespace=10, source=pywikibot.Site()
            )
        )
        generator = pywikibot.Site().page_embeddedin(
            transclusionPage, filterRedirects=None,
            namespaces=0, step=None, total=None, content=False
        )

    bot = HarvestRobot(generator, templateTitle, fields, overwrite)
    bot.run()

# Standard script entry point: run the bot only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()