User:Underlying lk/harvest

# -*- coding: utf-8 -*-

from __future__ import unicode_literals


python -lang:nl -template:"Taxobox straalvinnige" orde P70 familie P71 geslacht P74

This will work on all pages that transclude the template in the article namespace.

You can use any of the usual page generators to provide a list of pages:

python -lang:nl -cat:Sisoridae -template:"Taxobox straalvinnige" -namespace:0 orde P70 familie P71 geslacht P74

# (C) Multichill, Amir, 2013
# (C) Pywikibot team, 2013
# Distributed under the terms of MIT License.
__version__ = '$Id: 2507544f311b7164e04c7c83198a891f33e9f8ee $'

import re
import json
import pywikibot
from pywikibot import pagegenerators as pg
from datetime import datetime
from datetime import timedelta
from pywikibot import wdhelper

# Maps the '&params;' placeholder to the page generator help text.
# NOTE(review): presumably substituted into the module help by pywikibot -
# confirm against the framework's help handling.
docuReplacements = {'&params;': pg.parameterHelp}

class HarvestRobot:
    """A bot to add Wikidata claims."""
    def __init__(self, generator, templateTitle, fields, overwrite=False):
        """
        Set up the bot and cache the per-wiki source items.

        Arguments:
            * generator     - A generator that yields Page objects.
            * templateTitle - The template to work on
            * fields        - A dictionary of fields that are of use to us
            * overwrite     - if existing claims should be overwritten

        """
        self.generator = generator
        # Template titles use spaces, but may be passed with underscores.
        self.templateTitle = templateTitle.replace(u'_', u' ')
        # TODO: Make it a list which also includes the redirects to the template
        self.fields = fields
        self.overwrite = overwrite
        self.repo = pywikibot.Site().data_repository()
        # getSource() reads self.source_values, which only cacheSources()
        # populates; load it up front so the first lookup cannot fail.
        self.cacheSources()

    def getSource(self, site):
        """
        Get the source claim for the specified site, if possible.

        Looks the site up in ``self.source_values`` (family name -> site
        code -> item, as built by ``cacheSources``).

        Arguments:
            * site - The site the harvested page lives on; its
                     ``family.name`` and ``code`` are used as lookup keys.

        Returns a 'P143' claim whose target is the item registered for the
        site, or None when the site is not in the cached list.
        """
        family_sources = self.source_values.get(site.family.name)
        if not family_sources or site.code not in family_sources:
            return None
        source = pywikibot.Claim(self.repo, 'P143')
        # Without a target the claim would be an empty reference.
        source.setTarget(family_sources[site.code])
        return source

    def cacheSources(self):
        """
        Fetch the sources from the on-wiki list and store them internally.

        Reads the JSON on the repository page 'List of wikis/python'
        (namespace 4) into ``self.source_values`` and then replaces every
        value of the nested family -> language mapping with an ItemPage
        built from that value.
        """
        page = pywikibot.Page(self.repo, u'List of wikis/python', ns=4)
        self.source_values = json.loads(page.get())
        # Only existing keys are reassigned, so the dicts keep their size
        # and it is safe to update them while iterating.
        for family in self.source_values.values():
            for source_lang in family:
                family[source_lang] = pywikibot.ItemPage(self.repo,
                                                         family[source_lang])

    def run(self):
        """
        Start the robot.

        Resolves the template synonyms once, then processes every page the
        generator yields, passing its running index along. An exception
        raised while processing a page is reported and the run continues
        with the next page.
        """
        self.templateTitles = self.getTemplateSynonyms(self.templateTitle)
        for i, page in enumerate(self.generator):
            try:
                self.procesPage(i, page)
            except Exception:
                # Deliberate best-effort: one bad page must not abort the
                # whole run, but the failure is logged with its traceback.
                pywikibot.exception(tb=True)

    def newItem(self, page, item):
        """
        Create an item where none exists (originally by Multichill).

        The page is skipped, with a message, when it is a redirect, is in
        namespace 2 (user page), was edited within the last
        ``self.lastEdit`` days, was created within the last
        ``self.pageAge`` days, or carries language links. Otherwise a new
        entity is created with a sitelink to the page and a label equal to
        the page title.

        Arguments:
            * page - The wiki page that has no item yet.
            * item - The ItemPage obtained for that page; used to build the
                     site key and to submit the new entity.
        """
        self.pageAge = 21
        self.lastEdit = 7
        now = self.repo.getcurrenttime()
        self.pageAgeBefore = now - timedelta(days=self.pageAge)
        self.lastEditBefore = now - timedelta(days=self.lastEdit)

        if page.isRedirectPage():
            pywikibot.output('%s is a redirect page. Skipping.' % page)
            return
        if page.namespace() == 2:
            pywikibot.output('%s is a user page. Skipping.' % page)
            return
        if page.editTime() > self.lastEditBefore:
            pywikibot.output('Last edit on %s was on %s. Too recent. Skipping.' % (page, page.editTime().isoformat()))
            return

        # Oldest revision first, so the single entry is the page creation.
        (revId, revTimestamp, revUser, revComment) = page.getVersionHistory(reverseOrder=True, total=1)[0]
        if revTimestamp > self.pageAgeBefore:
            # Report the creation time, not the time of the last edit.
            pywikibot.output('Page creation of %s on %s is too recent. Skipping.' % (page, revTimestamp.isoformat()))
            return
        if page.langlinks():
            # FIXME: Implement this
            pywikibot.output('Found language links (interwiki links). Haven\'t implemented that yet so skipping.')
            return

        # FIXME: i18n
        summary = u'Bot: New item with sitelink from %s' % (page.title(asLink=True, insite=self.repo), )

        dbname = item.getdbName(page.site)
        lang = page.site.lang
        data = {'sitelinks':
                {dbname:
                 {'site': dbname,
                  'title': page.title()}
                 },
                'labels':
                {lang:
                 {'language': lang,
                  'value': page.title()}
                 }
                }
        item.editEntity(data, summary=summary)

    def getTemplateSynonyms(self, title):
        """
        Fetch the titles a template can be transcluded under.

        Arguments:
            * title - Title of the template (namespace 10 is applied).

        Returns a list of titles without namespace prefix: every redirect
        pointing at the template, plus the template's own title. If
        ``title`` is itself a redirect, its target is used as the template.
        """
        pywikibot.output('Finding redirects...')  # Put some output here since it can take a while
        temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
        if temp.isRedirectPage():
            temp = temp.getRedirectTarget()
        titles = [page.title(withNamespace=False)
                  for page
                  in temp.getReferences(redirectsOnly=True, namespaces=[10], follow_redirects=False)]
        # The canonical name must be included too, otherwise pages using the
        # template under its real title would never match.
        titles.append(temp.title(withNamespace=False))
        return titles

    # Process one page: look up (or create via newItem) its item, extract the
    # templates from the page text and, for templates listed in
    # self.templateTitles, turn the configured fields into claims built by
    # wdhelper.matchDatatype.
    #
    # Arguments:
    #     * index - Running number of the page, only used in the log output.
    #     * page  - The wiki page to harvest.
    #
    # NOTE(review): this method is incomplete as extracted. Several
    # statements appear to be missing (docstring quotes, the bodies of some
    # `if` branches, `else:` lines and parts of a few expressions), so it
    # does not parse as-is. The notes below mark the visible gaps; restore
    # them from the original script rather than guessing.
    def procesPage(self, index, page):
        Process a single page
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output(u'Processing No. %s: %s' % (index, page))
        if not item.exists():
            # create the page
            self.newItem(page, item)
            # Re-fetch the item, since newItem may have just created it.
            item = pywikibot.ItemPage.fromPage(page)
            if not item.exists():
                # The item was not created
                # NOTE(review): branch body missing - presumably an early
                # return; confirm.
        pagetext = page.get()
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Clean up template
            # NOTE(review): the call below is truncated (first argument and
            # closing part missing).
            template = pywikibot.Page(, template,
            # We found the template we were looking for
            if template in self.templateTitles:
                for field, value in fielddict.items():
                    field = field.strip()
                    if not field in self.fields:
                    # NOTE(review): branch body missing - presumably skips
                    # fields that are not configured; confirm.
                    # This field contains something useful for us
                    value = value.strip()
                    # Property id configured for this template field.
                    pid = self.fields[field]
                    # Check if the property isn't already set
                    if self.overwrite is False:
                        if pid in item.get().get('claims'):
                                # NOTE(review): the call this message was
                                # passed to is missing.
                                u'%s already exists (-overwrite to change it)'
                                % pid)
                            # TODO FIXME: This is a very crude way of dupe
                            # checking
                    # Searches the stringified last 50 history entries for a
                    # claim-removal summary mentioning this property.
                    # NOTE(review): `is not -1` compares identity, not value;
                    # `!= -1` (or the `in` operator) is the reliable test.
                    if str(item.getVersionHistory(total=50)).find("wbremoveclaims-remove:1| */ [[Property:" + pid) is not -1:
                        pywikibot.output('%s cannot be added as it was recently removed from the item' % (pid,))
                        # NOTE(review): an `else:` appears to be missing
                        # before the claim handling below; confirm.
                        claim = wdhelper.matchDatatype(pid, value, page, item)
                        if claim is None:
                        # NOTE(review): branch body missing.

                        if claim.getType() == 'quantity':
                            # temporary solution until quantities are properly supported
                            # NOTE(review): the quantity handling itself is
                            # not visible; the getSource argument is missing.
                            source = self.getSource(
                            if source:
                                claim.addSource(source, bot=True)

                        if claim.getID() in item.get().get('claims'):
                            # overwrite
                            # NOTE(review): the expression being indexed is
                            # missing from the next line.
                            claimToChange =[claim.getID()][0]
                            valueToChange = claimToChange.getTarget()
                            valueNew = claim.getTarget()
                            # if the new and old claims are the same
                            if valueToChange != valueNew:
                                pywikibot.output('Changing %s --> %s' % (valueToChange, valueNew))
                                # NOTE(review): the actual change and the
                                # `else:` for the next message are missing.
                                pywikibot.output('Old value %s same as new value %s' % (valueToChange, valueNew))
                            # NOTE(review): an `else:` and the call that adds
                            # the claim appear to be missing here.
                            pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget()))
                            # A generator might yield pages from multiple sites
                            source = self.getSource(
                            if source:
                                claim.addSource(source, bot=True)

def main():
    """
    Parse the command line, build the page generator and run the bot.

    Recognised options:
        * -template[:name] - template to harvest (prompted for when no
                             name is given)
        * -overwrite       - change existing claims
    Page generator options are handed to the generator factory. All
    remaining arguments are read as "field property" pairs.

    Raises ValueError when no template was given or the remaining
    arguments do not pair up.
    """
    gen = pg.GeneratorFactory()
    commandline_arguments = list()
    templateTitle = u''
    overwrite = False
    for arg in pywikibot.handleArgs():
        if arg.startswith('-template'):
            # A bare "-template" (9 characters) carries no value: ask for it.
            if len(arg) == 9:
                templateTitle = pywikibot.input(
                    u'Please enter the template to work on:')
            else:
                templateTitle = arg[10:]
        elif arg.startswith('-overwrite'):
            overwrite = True
        elif gen.handleArg(arg):
            # Consumed by the page generator factory.
            continue
        else:
            commandline_arguments.append(arg)

    if len(commandline_arguments) % 2 or not templateTitle:
        raise ValueError(
            'A template and an even number of field/property arguments '
            'are required')
    fields = dict()

    # Arguments alternate: template field name, then property id.
    for i in range(0, len(commandline_arguments), 2):
        fields[commandline_arguments[i]] = commandline_arguments[i + 1]

    generator = gen.getCombinedGenerator()
    if not generator:
        # transcluding generator based on templateTitle
        transclusionPage = pywikibot.Page(
            pywikibot.Link(
                templateTitle, defaultNamespace=10, source=pywikibot.Site()
            )
        )
        generator = pywikibot.Site().page_embeddedin(
            transclusionPage, filterRedirects=None,
            namespaces=0, step=None, total=None, content=False
        )

    bot = HarvestRobot(generator, templateTitle, fields, overwrite)
    bot.run()

if __name__ == "__main__":