#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Usage:
python harvest_template.py -lang:nl -template:"Taxobox straalvinnige" orde P70 familie P71 geslacht P74
This will work on all pages that transclude the template in the article
namespace
You can use any typical pagegenerator to provide a list of pages:
python harvest_template.py -lang:nl -cat:Sisoridae -template:"Taxobox straalvinnige" -namespace:0 orde P70 familie P71 geslacht P74
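
The -overwrite option also adds claims for properties that already have a
value on the item (by default such properties are skipped):
python harvest_template.py -lang:nl -template:"Taxobox straalvinnige" -overwrite orde P70 familie P71 geslacht P74
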
&params;
"""
#
# (C) Multichill, Amir, 2013
# (C) Pywikibot team, 2013
#
# Distributed under the terms of MIT License.
#
from __future__ import unicode_literals

__version__ = '$Id: 2507544f311b7164e04c7c83198a891f33e9f8ee $'
#
import re
import json
from datetime import timedelta

import pywikibot
import pywikibot.date
from pywikibot import pagegenerators as pg
docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
class HarvestRobot:
"""
A bot to add Wikidata claims
"""
def __init__(self, generator, templateTitle, fields, overwrite=False):
"""
Arguments:
* generator - A generator that yields Page objects.
* templateTitle - The template to work on
* fields - A dictionary of fields that are of use to us
* overwrite - whether to add claims for properties that already have a value
"""
self.generator = generator
self.templateTitle = templateTitle.replace(u'_', u' ')
# TODO: Make it a list which also includes the redirects to the template
self.fields = fields
self.overwrite = overwrite
self.repo = pywikibot.Site().data_repository()
self.cacheSources()
def getSource(self, site):
"""
Get the source for the specified site,
if possible
"""
if site.family.name in self.source_values and site.code in self.source_values[site.family.name]:
source = pywikibot.Claim(self.repo, 'P143')
source.setTarget(self.source_values.get(site.family.name).get(site.code))
return source
def cacheSources(self):
"""
Fetches the source items from the on-wiki list
and stores them internally
"""
page = pywikibot.Page(self.repo, u'List of wikis/python', ns=4)
self.source_values = json.loads(page.get())
for family_code, family in self.source_values.items():
for source_lang in family:
self.source_values[family_code][source_lang] = pywikibot.ItemPage(self.repo,
family[source_lang])
def run(self):
"""
Starts the robot.
"""
self.templateTitles = self.getTemplateSynonyms(self.templateTitle)
for i, page in enumerate(self.generator):
try:
self.processPage(i, page)
except Exception as e:
pywikibot.exception(tb=True)
def newItem(self, page, item):
"""
Create item where none exists (from newitem.py by Multichill)
"""
self.pageAge = 21
self.pageAgeBefore = self.repo.getcurrenttime() - timedelta(days=self.pageAge)
self.lastEdit = 7
self.lastEditBefore = self.repo.getcurrenttime() - timedelta(days=self.lastEdit)
if page.isRedirectPage():
pywikibot.output('%s is a redirect page. Skipping.' % page)
elif page.namespace() == 2:
pywikibot.output('%s is a user page. Skipping.' % page)
elif page.editTime() > self.lastEditBefore:
pywikibot.output('Last edit on %s was on %s. Too recent. Skipping.' % (page, page.editTime().isoformat()))
else:
(revId, revTimestamp, revUser, revComment) = page.getVersionHistory(reverseOrder=True, total=1)[0]
if revTimestamp > self.pageAgeBefore:
pywikibot.output('Page creation of %s on %s is too recent. Skipping.' % (page, revTimestamp.isoformat()))
elif page.langlinks():
# FIXME: Implement this
pywikibot.output('Found language links (interwiki links). Haven\'t implemented that yet so skipping.')
else:
# FIXME: i18n
summary = u'Bot: New item with sitelink from %s' % (page.title(asLink=True, insite=self.repo), )
data = {'sitelinks':
{item.getdbName(page.site):
{'site': item.getdbName(page.site),
'title': page.title()}
},
'labels':
{page.site.lang:
{'language': page.site.lang,
'value': page.title()}
}
}
pywikibot.output(summary)
item.editEntity(data, summary=summary)
def getTemplateSynonyms(self, title):
"""
Fetches redirects of the title, so we can check against them
"""
pywikibot.output('Finding redirects...') # Put some output here since it can take a while
temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
if temp.isRedirectPage():
temp = temp.getRedirectTarget()
titles = [page.title(withNamespace=False)
for page
in temp.getReferences(redirectsOnly=True, namespaces=[10], follow_redirects=False)]
titles.append(temp.title(withNamespace=False))
return titles
def extractUrl(self, page, value, claim):
"""
Extract url datatype from field
"""
match = re.search("(http[^|\s\]\}]+)", value) # try format {{URL|htp://www.example.com}} or [http://www.example.com Example]
if match is None:
match = re.search("{{\w*\|(www\S+)}}", value) # try format {{URL|www.example.com}}
if match is None:
match = re.search("{{\w*\|(\S+)}}", value) # try format {{URL|example.com}}
extractedUrl = "http://www." + match.group(1)
else:
extractedUrl = "http://" + match.group(1)
else:
extractedUrl = match.group(1)
claim.setTarget(extractedUrl)
pywikibot.output('Extracting %s --> %s' % (value, extractedUrl))
def monthToNumber(self, page, month):
"""
Returns an integer from the month name
"""
# return if the month is already int
try:
month = int(month)
return month
except ValueError:
pass
# try formats from date.py
month_dictio = {}
languageEdition = page.site.language()
getMonthInt = pywikibot.date.getAutoFormat
if getMonthInt(languageEdition, month)[1]:
month = getMonthInt(languageEdition, month)[1]
else:
if languageEdition == "cs": # Česky (Czech)
month_dictio = {"ledna": "1", "února": "2", "března": "3", "dubna": "4",
"května": "5", "června": "6", "července": "7", "srpna": "8",
"října": "10", "listopadu": "11", "prosince": "12"}
elif languageEdition == "fr": # Français
month_dictio = {"mars": "3"}
elif languageEdition == "ru": # Russian
month_dictio = {"января": "1", "февраля": "2", "марта": "3", "апреля": "4",
"мая": "5", "июня": "6", "июля": "7", "августа": "8",
"сентября": "9", "октября": "10", "ноября": "11", "декабря": "12"}
for monthName, monthNumber in month_dictio.items():
if month == monthName:
month = month.replace(monthName, monthNumber)
break
try:
month = int(month)
except ValueError:
pywikibot.output('%s is not a valid month' % month)
month = None
return month
def extractTime(self, page, value, claim):
"""
Extract time from field
"""
extractedMonth = None
extractedDay = None
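# Map each date regex to the order in which its capture groups hold the
# date components; the first pattern that matches the field value wins.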
regexDict = {"\{\{(?:[dD]ate|[fF]echa|[dD]ni|[fF]alec).*?\|(\d{1,2})\|(\w*)\|(\d{1,4})": "ddMMyyyy", # templates, format 1 January 2000
"(\d{1,2})\.?\s(\w*)\]{0,2},?\s\[{0,2}(\d{1,4})": "ddMMyyyy", # format 1 January 2000
"(\d{1,2})[\.\/](\w*)[\.\/](\d{3,4})": "ddMMyyyy", # format 01.01.2000
"(\d{3,4})[\.\/-](\w*)[\.\/-](\d{1,2})": "yyyymmdd", # format 2000-01-01
"(\d{1,2})\sde\s(\w*)\]{0,2}\sde\s\[{0,2}(\d{1,4})": "ddMMyyyy", # format [[1 de enero]] de [[2000]]
"(\w*)\.?\s(\d{1,2}),\s(\d{1,4})": "MMddyyyy", # format January 1, 2000
"\{\{\w(?:irth|eath|tart|ilm).*?(\d{1,4})[\|\}](?:\w*=)?(?:\d{3,4}|(\d{0,2}))[\|\}](?:\w*=)?(\d{0,2})": "yyyymmdd", # English templates, format 2000 01 01
"([\d,]+)(?:\s*| )(?:BC|bc|av. J)": "yyyy BC"} # years BC, format 1000 BCE
for regex, timeFormat in regexDict.items():
match = re.search(regex, value, re.UNICODE)
if match is not None:
if timeFormat == "ddMMyyyy": # day, month name, year
extractedMonth = self.monthToNumber(page, match.group(2))
if extractedMonth is None:
match = None
continue
else:
extractedYear = int(match.group(3))
extractedDay = int(match.group(1))
break
elif timeFormat == "MMddyyyy": # month name, day, year
extractedMonth = self.monthToNumber(page, match.group(1))
if extractedMonth is None:
match = None
continue
else:
extractedYear = int(match.group(3))
extractedDay = int(match.group(2))
break
elif timeFormat == "yyyymmdd": # year, month, day
if match.group(2):
extractedMonth = int(match.group(2))
if match.group(3):
extractedDay = int(match.group(3))
extractedYear = int(match.group(1))
break
elif timeFormat == "yyyy BC": # year BCE
extractedYear = "-" + match.group(1).replace(",", "") # remove commas, if any
break
if match is None:
match = re.search("^\[{0,2}(-?\d{1,4})(?!\d*(st|nd|rd|th))", value) # last resort
extractedYear = match.group(1)
timeclaim = pywikibot.WbTime(year=extractedYear, month=extractedMonth, day=extractedDay)
claim.setTarget(timeclaim)
pywikibot.output('Extracting %s --> %s-%s-%s' % (value, extractedYear, extractedMonth, extractedDay))
def processPage(self, index, page):
"""
Process a single page
"""
item = pywikibot.ItemPage.fromPage(page)
pywikibot.output(u'Processing No. %s: %s' % (index, page))
if not item.exists():
# create the page
self.newItem(page, item)
item = pywikibot.ItemPage.fromPage(page)
if not item.exists():
# The item was not created
return
pagetext = page.get()
templates = pywikibot.extract_templates_and_params(pagetext)
for (template, fielddict) in templates:
# Clean up template
template = pywikibot.Page(page.site, template,
ns=10).title(withNamespace=False)
# We found the template we were looking for
if template in self.templateTitles:
for field, value in fielddict.items():
field = field.strip()
value = value.strip()
# This field contains something useful for us
if field in self.fields:
claim = pywikibot.Claim(self.repo, self.fields[field])
# Check if the property isn't already set
if self.overwrite is False:
if claim.getID() in item.get().get('claims'):
pywikibot.output(
u'%s already exists (-overwrite to change it)'
% claim.getID())
continue
# TODO FIXME: This is a very crude way of dupe
# checking
if str(item.getVersionHistory(total=50)).find("wbremoveclaims-remove:1| */ [[Property:" + claim.getID()) != -1:
pywikibot.output('%s cannot be added as it was recently removed from the item' % (claim.getID(),))
else:
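# Collect candidate values: wikilink targets (except File:/Image: links)
# and the first parameter of {{flag...}} templates (but not {{flagicon}}).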
match = re.findall("\[\[(?!(?:File:|Image:))([^|]*?)(?:\|.*?)?\]\]", value)
match2 = re.findall("\{\{[fF]lag(?!icon).*?\|([^|]*?)(?:\|.*?)?\}\}", value)
for additional in match2:
match.insert(0, additional)
for value in match:
value = "[[" + value + "]]"
if claim.getType() == 'wikibase-item':
# Try to extract a valid page
match = re.search(pywikibot.link_regex, value)
if match is None:
pywikibot.output('No valid item found for %s' % (claim.getID(),))
continue
else:
try:
link = pywikibot.Link(match.group(1))
linkedPage = pywikibot.Page(link)
if linkedPage.isRedirectPage():
linkedPage = linkedPage.getRedirectTarget()
linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
claim.setTarget(linkedItem)
linkedItem.title() # forces the item data to load; raises NoPage if the item does not exist
if linkedPage.isDisambig(): # avoid adding disambiguation pages
pywikibot.output('%s is a disambiguation page. Skipping.' % (linkedPage,))
continue
except pywikibot.exceptions.NoPage:
pywikibot.output('%s doesn\'t exist so I can\'t link to it' % (linkedPage,))
continue
elif claim.getType() == 'string':
if value == "":
pywikibot.output('No valid string found for %s' % (claim.getID(),))
continue
else:
match = re.search("^(?!(?:\{|\[http))\W*([^<>]+)(?<![<>])", value, re.UNICODE) # avoid tags, linka and templates
if match:
value = match.group(1)
claim.setTarget(value.strip())
else:
pywikibot.output('%s includes forbidden characters' % (value,))
continue
elif claim.getType() == 'url':
try:
self.extractUrl(page, value, claim)
except AttributeError:
pywikibot.output('No valid URL found for %s' % (claim.getID(),))
continue
elif claim.getType() == 'time':
try:
self.extractTime(page, value, claim)
except AttributeError:
pywikibot.output('No valid time for %s' % (claim.getID(),))
continue
elif claim.getType() == 'commonsMedia':
mediasite = pywikibot.Site("commons", "commons")
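# Accept bare filenames, "File:Name.ext" titles or values wrapped in
# link/template syntax; normalise to a "File:" title and check that the
# file actually exists on Commons before using it as the target.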
if value == "":
pywikibot.output('No valid media file found for %s' % (claim.getID(),))
continue
else:
match = re.search("\w{4,5}:([^|\]]*)", value) # extract filename
if match is not None:
value = "File:" + match.group(1)
elif not value.startswith("File:"):
value = "File:" + value
# check if the image exists on Commons
image = pywikibot.ImagePage(mediasite, value)
if image.exists() is False:
pywikibot.output('%s does not exist on Commons' % (value,))
continue
else:
claim.setTarget(image)
else:
pywikibot.output("%s is not a supported datatype." % claim.getType())
continue
alreadythere = False
try:
for existingclaim in item.get().get('claims')[self.fields[field]]:
if claim.getTarget() == existingclaim.getTarget():
print ("%s already exists with %s" %
(existingclaim.getID(), existingclaim.getTarget()))
alreadythere = True
break
except KeyError:
pass
if alreadythere:
continue
pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget()))
item.addClaim(claim)
# A generator might yield pages from multiple sites
source = self.getSource(page.site)
if source:
claim.addSource(source, bot=True)
def main():
gen = pg.GeneratorFactory()
commandline_arguments = list()
templateTitle = u''
overwrite = False
for arg in pywikibot.handleArgs():
if arg.startswith('-template'):
if len(arg) == 9:
templateTitle = pywikibot.input(
u'Please enter the template to work on:')
else:
templateTitle = arg[10:]
elif arg.startswith('-overwrite'):
overwrite = True
elif gen.handleArg(arg):
continue
else:
commandline_arguments.append(arg)
if len(commandline_arguments) % 2 or not templateTitle:
raise ValueError('Expected a template title and pairs of template field / property id arguments')
fields = dict()
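# The remaining positional arguments come in pairs: a template field name
# followed by the property id it should be written to (e.g. "orde P70").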
for i in range(0, len(commandline_arguments), 2):
fields[commandline_arguments[i]] = commandline_arguments[i + 1]
generator = gen.getCombinedGenerator()
if not generator:
# transcluding generator based on templateTitle
transclusionPage = pywikibot.Page(
pywikibot.Link(
templateTitle, defaultNamespace=10, source=pywikibot.Site()
)
)
generator = pywikibot.Site().page_embeddedin(
transclusionPage, filterRedirects=None,
namespaces=0, step=None, total=None, content=False
)
bot = HarvestRobot(generator, templateTitle, fields, overwrite)
bot.run()
if __name__ == "__main__":
main()