# User:Underlying lk/claimit.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script adds claims to Wikidata items based on categories.

------------------------------------------------------------------------------

Usage:

python claimit.py [pagegenerators] P1 Q2 P123 Q456

You can use any typical pagegenerator to provide a list of pages.
Then list the property-->target pairs to add.

------------------------------------------------------------------------------

For geographic coordinates:

python claimit.py [pagegenerators] P625 [lat-dec],[long-dec],[prec]

[lat-dec] and [long-dec] represent the latitude and longitude respectively,
and [prec] represents the precision. All values are in decimal degrees,
not DMS. If [prec] is omitted, the default precision is 0.0001 degrees.

Example:

python claimit.py [pagegenerators] P625 -23.3991,-52.0910,0.0001

------------------------------------------------------------------------------

By default, claimit.py does not add a claim if one with the same property
already exists on the page. To override this behavior, use the 'exists' option:

python claimit.py [pagegenerators] P246 "string example" -exists:p

Suppose the claim you want to add has the same property as an existing claim
and the "-exists:p" argument is used. Now, claimit.py will not add the claim
if it has the same target, sources, and/or qualifiers as the existing claim.
To override this behavior, add 't' (target), 's' (sources), or 'q' (qualifiers)
to the 'exists' argument.

For instance, to add the claim to each page even if one with the same
property, target, and qualifiers already exists:

python claimit.py [pagegenerators] P246 "string example" -exists:ptq

Note that the ordering of the letters in the 'exists' argument does not matter,
but 'p' must be included.

"""
#
# (C) Legoktm, 2013
# (C) Pywikibot team, 2013
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: db8ff17f23c0c40c6efae64d5eb8d93c3a7c81b5 $'
#

import json
import re
import pywikibot
from pywikibot import pagegenerators
from datetime import datetime
from datetime import timedelta


class ClaimRobot:
    """
    A bot to add Wikidata claims

    For every page yielded by the generator the bot fetches the connected
    item (trying to create it through newItem() when it does not exist) and
    adds each configured claim, unless the claim is treated as a duplicate
    or the item's recent revision comments show the property was removed.
    """
    def __init__(self, generator, claims, exists_arg=''):
        """
        Arguments:
            * generator    - A generator that yields Page objects.
            * claims       - A list of wikidata claims
            * exists_arg   - String specifying how to handle duplicate claims
                             (letters 'p', 't', 's', 'q'; see run())

        """
        self.generator = generator
        self.claims = claims
        self.exists_arg = exists_arg
        self.repo = pywikibot.Site().data_repository()
        self.cacheSources()

    def getSource(self, lang):
        """
        Get the source for the specified language,
        if possible

        Returns a 'p143' claim whose target is the cached item for lang,
        or None when no item is cached for that language.
        """
        if lang not in self.source_values:
            return None
        source = pywikibot.Claim(self.repo, 'p143')
        source.setTarget(self.source_values[lang])
        return source

    def cacheSources(self):
        """
        Fetches the sources from the onwiki list
        and stores it internally

        The page content is parsed as JSON; its 'wikipedia' entry is kept in
        self.source_values with every value wrapped in an ItemPage.
        """
        page = pywikibot.Page(self.repo, u'Wikidata:List of wikis/python')
        wikipedia_sources = json.loads(page.get())['wikipedia']
        for source_lang in wikipedia_sources:
            wikipedia_sources[source_lang] = pywikibot.ItemPage(
                self.repo, wikipedia_sources[source_lang])
        self.source_values = wikipedia_sources

    def newItem(self, page, item):
        """
        Create item where none exists (from newitem.py by Multichill)

        Nothing is created when the page is a redirect, was edited within
        the last self.lastEdit days, has a first revision younger than
        self.pageAge days, or has language links.
        """
        # Take the current time once so both thresholds share one reference.
        now = self.repo.getcurrenttime()
        self.pageAge = 21
        self.pageAgeBefore = now - timedelta(days=self.pageAge)
        self.lastEdit = 7
        self.lastEditBefore = now - timedelta(days=self.lastEdit)

        if page.isRedirectPage():
            pywikibot.output('%s is a redirect page. Skipping.' % page)
            return
        if page.editTime() > self.lastEditBefore:
            pywikibot.output('Last edit on %s was on %s. Too recent. Skipping.' % (page, page.editTime().isoformat()))
            return

        # The single entry requested here is treated as the creation revision;
        # only its timestamp (second field) is needed.
        first_revision = page.getVersionHistory(reverseOrder=True, total=1)[0]
        revTimestamp = first_revision[1]
        if revTimestamp > self.pageAgeBefore:
            # Report the creation time that was actually compared, not the
            # time of the last edit.
            pywikibot.output('Page creation of %s on %s is too recent. Skipping.' % (page, revTimestamp.isoformat()))
            return
        if page.langlinks():
            # FIXME: Implement this
            pywikibot.output('Found language links (interwiki links). Haven\'t implemented that yet so skipping.')
            return

        # FIXME: i18n
        summary = u'Bot: New item with sitelink from %s' % (page.title(asLink=True, insite=self.repo), )
        dbname = item.getdbName(page.site)
        lang = page.site.lang
        data = {'sitelinks': {dbname: {'site': dbname,
                                       'title': page.title()}},
                'labels': {lang: {'language': lang,
                                  'value': page.title()}}}
        pywikibot.output(summary)
        item.editEntity(data, summary=summary)

    def run(self):
        """
        Starts the robot.

        Letters in self.exists_arg relax the duplicate checks: 'p' allows a
        claim whose property is already present, and 't', 's', 'q' then
        ignore a matching target, sources or qualifiers respectively.
        """
        if self.exists_arg:
            pywikibot.output('\'exists\' argument set to \'%s\'' % self.exists_arg)
        for page in self.generator:
            pywikibot.output('Processing %s' % page)
            item = pywikibot.ItemPage.fromPage(page)
            if not item.exists():
                # create the page
                self.newItem(page, item)
                item = pywikibot.ItemPage.fromPage(page)
                if not item.exists():
                    # The item was not created
                    continue
            # NOTE(review): the same Claim objects are reused for every page;
            # whether addClaim()/addSource() leave state on them that affects
            # the comparisons for later pages should be confirmed.
            for claim in self.claims:
                if self._isDuplicate(item, claim):
                    pywikibot.output('%s is already set' % (claim.getID()))
                    continue
                # check if the bot was reverted recently
                if self._recentlyRemoved(item, claim):
                    pywikibot.output('%s cannot be added as it was recently removed from the item' % (claim.getID(),))
                    continue
                pywikibot.output('Adding %s --> %s'
                                 % (claim.getID(), claim.getTarget()))
                item.addClaim(claim)
                # A generator might yield pages from multiple languages
                source = self.getSource(page.site.language())
                if source:
                    claim.addSource(source, bot=True)
                # TODO FIXME: We need to check that we aren't adding a
                # duplicate

    def _isDuplicate(self, item, claim):
        """
        Return True if claim must be skipped because of the claims on item.
        """
        prop_id = claim.getID()
        if prop_id not in item.claims:
            return False
        if self.exists_arg is None or 'p' not in self.exists_arg:
            pywikibot.log('Skipping %s because claim with same property already exists' % (prop_id,))
            pywikibot.log('Use the -exists:p option to override this behavior')
            return True
        # If some attribute of the claim being added matches some attribute
        # in an existing claim of the same property, skip the claim, unless
        # the 'exists' argument overrides it.
        for existing in item.claims[prop_id]:
            if claim.getTarget() == existing.getTarget() and 't' not in self.exists_arg:
                pywikibot.log('Skipping %s because claim with same target already exists' % (prop_id,))
                pywikibot.log('Append \'t\' to the -exists argument to override this behavior')
                return True
            if listsEqual(claim.getSources(), existing.getSources()) and 's' not in self.exists_arg:
                pywikibot.log('Skipping %s because claim with same sources already exists' % (prop_id,))
                pywikibot.log('Append \'s\' to the -exists argument to override this behavior')
                return True
            if listsEqual(claim.qualifiers, existing.qualifiers) and 'q' not in self.exists_arg:
                pywikibot.log('Skipping %s because claim with same qualifiers already exists' % (prop_id,))
                pywikibot.log('Append \'q\' to the -exists argument to override this behavior')
                return True
        return False

    def _recentlyRemoved(self, item, claim):
        """
        Return True if the item's last 35 revision comments record a removal
        of the claim's property.
        """
        revhist = pywikibot.data.api.Request(site=self.repo, action="query", titles=item.getID(), prop="revisions", rvprop="comment", rvlimit="35")
        revisions_text = str(revhist.submit())
        return self._removalLogged(claim.getID(), revisions_text)

    @staticmethod
    def _removalLogged(prop_id, revisions_text):
        """
        Return True if revisions_text contains a removal entry for prop_id.

        The property ID is escaped and must not be followed by another digit,
        so that e.g. P31 does not match a removal of P310.
        """
        pattern = "remove.{9}Property:" + re.escape(prop_id) + r"(?!\d)"
        return re.search(pattern, revisions_text) is not None


def listsEqual(list1, list2):
    """
    Returns true if both lists hold the same items, ignoring order.

    Items are only compared with ==, so the check works for lists of
    unhashable and unorderable items (like dictionaries) and takes
    duplicates into account.
    """
    if len(list1) != len(list2):
        return False
    # Cross every item of list1 off a copy of list2; with equal lengths the
    # lists match exactly when nothing fails to be crossed off.
    unmatched = list(list2)
    for item in list1:
        try:
            unmatched.remove(item)
        except ValueError:
            return False
    return True


def main():
    """
    Parse the command line, build the claims and run the bot.

    An argument starting with '-exists:' sets the duplicate handling, the
    arguments accepted by the page generator factory select the pages, and
    all remaining arguments are read as property/target pairs.

    Raises ValueError if the remaining arguments do not form complete pairs
    or a coordinate has fewer than two values, and NotImplementedError if a
    property has a datatype that is not handled here.
    """
    exists_arg = ''
    gen = pagegenerators.GeneratorFactory()
    commandline_claims = []
    for arg in pywikibot.handleArgs():
        # Handle args specifying how to handle duplicate claims
        if arg.startswith('-exists:'):
            exists_arg = arg.split(':')[1].strip('"')
            continue
        # Handle page generator args
        if gen.handleArg(arg):
            continue
        commandline_claims.append(arg)
    if len(commandline_claims) % 2:
        raise ValueError(
            'Claims must be given as property/target pairs, '
            'but an odd number of arguments was found: %r'
            % (commandline_claims,))

    claims = []
    repo = pywikibot.Site().data_repository()
    for i in range(0, len(commandline_claims), 2):
        value = commandline_claims[i + 1]
        claim = pywikibot.Claim(repo, commandline_claims[i])
        datatype = claim.getType()
        if datatype == 'wikibase-item':
            target = pywikibot.ItemPage(repo, value)
        elif datatype == 'time':
            # NOTE(review): the year is passed on as the raw command-line
            # string; confirm that WbTime converts it to a number.
            target = pywikibot.WbTime(year=value)
        elif datatype == 'string':
            target = value
        elif datatype == 'globecoordinate':
            # Build a real list: the values are counted and indexed below,
            # which a lazy map object does not support.
            coord_args = [float(part) for part in value.split(',')]
            if len(coord_args) < 2:
                raise ValueError(
                    'Coordinates must be given as [lat-dec],[long-dec]'
                    '[,[prec]], got %r' % (value,))
            if len(coord_args) >= 3:
                precision = coord_args[2]
            else:
                precision = 0.0001  # Default value (~10 m at equator)
            target = pywikibot.Coordinate(coord_args[0], coord_args[1], precision=precision)
        else:
            raise NotImplementedError(
                "%s datatype is not yet supported by claimit.py"
                % datatype)
        claim.setTarget(target)
        claims.append(claim)

    generator = gen.getCombinedGenerator()
    if not generator:
        # FIXME: Should throw some help
        return

    bot = ClaimRobot(generator, claims, exists_arg)
    bot.run()

# Run the bot only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()