User:Underlying lk/wdhelper.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
"""
Helper functions to handle values extracted from templates and
add them to a Wikidata claim

Save it to the core/pywikibot folder

Datatypes are explained in
https://www.wikidata.org/wiki/Special:ListDatatypes

The following datatypes are supported:
Item, Commons media, String, Time, URL, Quantity
Globe coordinates are currently NOT supported
"""
import re
import json
import pywikibot

# Data repository of the default site. It is created once, when this module
# is imported, and is used by extractQuantity() and matchDatatype().
repo = pywikibot.Site().data_repository()


def extractItem(claim, value, page, item):
    """
    Extract the item behind the first wikilink in ``value`` and set it as
    the target of ``claim``.

    - claim: claim to fill; getID() and setTarget() are used
    - value: text searched with pywikibot.link_regex
    - page: page the value comes from; page.site is used to build the link
    - item: item being edited, compared with the linked item

    Returns the claim with its target set. Returns None when no link is
    found, when NoPage is raised while resolving the link, when the linked
    page is a disambiguation page, or when the linked item equals ``item``.
    """
    match = re.search(pywikibot.link_regex, value)
    if match is None:
        pywikibot.output('No valid item found for %s' % (claim.getID(),))
        return
    # built before the try block so that linkedPage is always bound when the
    # NoPage handler formats its message
    link = pywikibot.Link(match.group(1), page.site)
    linkedPage = pywikibot.Page(link)
    try:
        if linkedPage.isRedirectPage():
            linkedPage = linkedPage.getRedirectTarget()
        linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
        # per the original author's note, title() is called only so that
        # NoPage is raised here; its result is not used
        linkedItem.title()
        # avoid adding disambiguation pages
        if linkedPage.isDisambig():
            pywikibot.output('%s is a disambiguation page. Skipping.' % (linkedPage,))
            return
        # the object of a claim should not be the same as the item
        if item == linkedItem:
            pywikibot.output("The target and the item are identical, skipping")
            return
        claim.setTarget(linkedItem)
        return claim
    except pywikibot.exceptions.NoPage:
        pywikibot.output('%s doesn\'t exist so I can\'t link to it' % (linkedPage,))
        return


def extractMedia(claim, value):
    """
    Set a Commons file named in ``value`` as the target of ``claim``.

    - claim: claim to fill; getID() and setTarget() are used
    - value: file name, optionally behind a short "xx:" prefix

    Returns the claim with its target set, or None when ``value`` is empty
    or the file does not exist on Commons.
    """
    commons = pywikibot.Site("commons", "commons")
    if value == "":
        pywikibot.output('No valid media file found for %s' % (claim.getID(),))
        return

    # keep what follows a 2-5 character "xx:" prefix, up to the next | or ]
    prefixed = re.search("\w{2,5}:([^|\]]*)", value, re.UNICODE)
    if prefixed is not None:
        filename = "File:" + prefixed.group(1)
    elif value.startswith("File:"):
        filename = value
    else:
        filename = "File:" + value

    # check if the image exists on Commons
    image = pywikibot.ImagePage(commons, filename)
    if image.exists() is False:
        pywikibot.output('%s does not exist on Commons' % (filename,))
        return
    claim.setTarget(image)
    return claim


def extractString(claim, value):
    """
    Extract a plain string from ``value`` and set it as the target of
    ``claim``.

    - claim: claim to fill; getID() and setTarget() are used
    - value: raw text taken from a template field

    Leading characters that are neither word characters nor angle brackets
    are skipped, and the string ends before the first "<" or ">".

    Returns the claim with its target set. Returns None when ``value`` is
    empty, starts with "{" or "[http", starts with an angle bracket, or
    holds nothing but whitespace.
    """
    if value == "":
        pywikibot.output('No valid string found for %s' % (claim.getID(),))
        return
    # avoid tags, links and templates; the leading run must not consume
    # "<" or ">", otherwise "<ref>x</ref>" would yield the tag name "ref"
    match = re.search(r"^(?!(?:\{|\[http))[^\w<>]*([^<>]+)(?<![<>])", value, re.UNICODE)
    if match is None:
        pywikibot.output('%s includes forbidden characters' % (value,))
        return
    target = match.group(1).strip()
    if not target:
        # whitespace-only input must not become an empty string target
        pywikibot.output('No valid string found for %s' % (claim.getID(),))
        return
    claim.setTarget(target)
    return claim


def extractTime(page, value, claim):
    """
    Extract a date from ``value`` and set it as the target of ``claim``.

    - page: passed on to monthToNumber() to resolve month names
    - value: raw text taken from a template field
    - claim: claim to fill; getID() and setTarget() are used

    Each pattern in regexDict is searched in ``value``; the first match
    that can be used ends the loop. Matches whose month cannot be resolved
    by monthToNumber() are discarded and the next pattern is tried. If no
    pattern is usable, a leading year of 1-4 digits is tried as a last
    resort.

    Returns the claim with a pywikibot.WbTime target, or None when no
    date could be found.
    """
    extractedMonth = None
    extractedDay = None
    # Maps each pattern to the layout of its capture groups.
    # NOTE(review): this is a plain dict and the loop below walks
    # regexDict.items(); on Python 2 that order is arbitrary, so the
    # patterns are not necessarily tried in the order written here.
    # Several patterns overlap, so confirm the effective precedence before
    # reordering or converting this to a list.
    regexDict = {
        # templates, format 1 January 2000
        "\{\{(?:[dD]ate|[fF]echa|[dD]ni|[fF]alec).*?\|(\d{1,2})\|(\w*)\|(\d{1,4})": "ddMMyyyy",
        # format 1 January 2000
        "(?:(\d{1,2})[\.º]?\s)?(\w*)\]{0,2},?\s\[{0,2}(\d{1,4})": "ddMMyyyy",
        # format 01.01.2000
        "(\d{1,2})[\.\/](\w*)[\.\/](\d{3,4})": "ddMMyyyy",
        # format 2000-01-01
        "^(?:(?!access).)*?(\d{3,4})[\.\/-](\w*)[\.\/-](\d{1,2})": "yyyymmdd",
        # format [[1 de enero]] de [[2000]]
        "(\d{1,2})\sde\s(\w*)\]{0,2}\sde\s\[{0,2}(\d{1,4})": "ddMMyyyy",
        # format January 1, 2000
        "(\w*)\.?\s(\d{1,2}),\s(\d{1,4})": "MMddyyyy",
        # English templates, format 2000 01 01
        "\{\{\w(?:irth|eath|tart|ilm).*?(\d{1,4})[\|\}](?:\w*=)?(?:\d{3,4}|(\d{0,2}))[\|\}](?:\w*=)?(\d{0,2})": "yyyymmdd",
        # years BC, format 1000 BCE
        "([\d,]+)(?:\s*|&nbsp;)(?:BC|bc|av. J|a.C.)": "yyyy BC"
    }
    for regex, timeFormat in regexDict.items():
        match = re.search(regex, value, re.UNICODE)
        if match is not None:
            if timeFormat == "ddMMyyyy":  # day, month name, year
                extractedMonth = monthToNumber(page, match.group(2))
                if extractedMonth is None:
                    # unusable month: forget this match and try the next pattern
                    match = None
                    continue
                else:
                    extractedYear = int(match.group(3))
                    # the day group is optional in one of the patterns
                    extractedDay = match.group(1)
                    if extractedDay != None:
                        extractedDay = int(extractedDay)
                    break
            elif timeFormat == "MMddyyyy":  # month name, day, year
                extractedMonth = monthToNumber(page, match.group(1))
                if extractedMonth is None:
                    # unusable month: forget this match and try the next pattern
                    match = None
                    continue
                else:
                    extractedYear = int(match.group(3))
                    extractedDay = int(match.group(2))
                    break
            elif timeFormat == "yyyymmdd":  # year, month, day
                # month and day may be missing or empty and then stay None
                # NOTE(review): group 2 of the "2000-01-01" pattern is \w*,
                # so int() raises ValueError for a non-numeric middle part
                if match.group(2):
                    extractedMonth = int(match.group(2))
                if match.group(3):
                    extractedDay = int(match.group(3))
                extractedYear = int(match.group(1))
                break
            elif timeFormat == "yyyy BC":  # year BCE
                # the year is kept as a string here, unlike the branches above
                extractedYear = "-" + match.group(1).replace(",", "")  # remove commas, if any
                break
    if match is None:
        # last resort: a leading year, optionally negative or behind "[[",
        # that is not followed by an ordinal suffix; the year stays a string
        match = re.search("^\[{0,2}(-?\d{1,4})(?!\d*(st|nd|rd|th))", value)  # last resort
        try:
            extractedYear = match.group(1)
        except AttributeError:
            # match is None: nothing in value looked like a date
            pywikibot.output('No valid time for %s' % (claim.getID(),))
            return
    timeclaim = pywikibot.WbTime(year=extractedYear, month=extractedMonth, day=extractedDay)
    claim.setTarget(timeclaim)
    pywikibot.output('Extracting %s --> %s-%s-%s' % (value, extractedYear, extractedMonth, extractedDay))
    return claim


def monthToNumber(page, month):
    """
    Return the month number for a month given as digits or as a name.

    - page: page the value comes from; page.site.language() selects the
      language used to resolve month names
    - month: month as text, either digits or a month name

    A value that int() accepts is returned as is, without a range check.
    Names are looked up with pywikibot.date.getAutoFormat first and then
    in the per-language tables below.

    Returns an int, or None when the month cannot be resolved.
    """
    # return if the month is already int
    try:
        return int(month)
    except ValueError:
        pass
    languageEdition = page.site.language()
    # try formats from date.py; one lookup is enough
    autoValue = pywikibot.date.getAutoFormat(languageEdition, month)[1]
    if autoValue:
        month = autoValue
    else:
        # forms the lookup above did not resolve
        month_dictio = {}
        if languageEdition == "cs":   # Česky (Czech)
            month_dictio = {"ledna": "1", "února": "2", "března": "3", "dubna": "4",
                            "května": "5", "června": "6", "července": "7", "srpna": "8",
                            "října": "10", "listopadu": "11", "prosince": "12"}
        elif languageEdition == "fr":   # Français
            month_dictio = {"mars": "3"}
        elif languageEdition == "ru":   # Russian
            month_dictio = {"января": "1", "февраля": "2", "марта": "3", "апреля": "4",
                            "мая": "5", "июня": "6", "июля": "7", "августа": "8",
                            "сентября": "9", "октября": "10", "ноября": "11", "декабря": "12"}
        # unknown names are left untouched and rejected below
        month = month_dictio.get(month, month)
    try:
        return int(month)
    except ValueError:
        pywikibot.output('%s is not a valid month' % (month,))
        return None


def extractUrl(value, claim):
    """
    Extract a URL from ``value`` and set it as the target of ``claim``.

    - value: raw text taken from a template field
    - claim: claim to fill; getID() and setTarget() are used

    Returns the claim with its target set, or None when no URL is found.
    """
    # try format {{URL|http://www.example.com}} or [http://www.example.com Example]
    found = re.search("(http[^|\s\]\}]+)", value)
    if found is not None:
        extractedUrl = found.group(1)
    else:
        fallbacks = (
            # try format {{URL|www.example.com}}
            ("{{\w*\|(www\S+)}}", "http://"),
            # try format {{URL|example.com}}
            ("{{\w*\|(\S+)}}", "http://www."),
        )
        for pattern, prefix in fallbacks:
            found = re.search(pattern, value)
            if found is not None:
                extractedUrl = prefix + found.group(1)
                break
        else:
            # none of the formats matched
            pywikibot.output('No valid URL found for %s' % (claim.getID(),))
            return
    claim.setTarget(extractedUrl)
    pywikibot.output('Extracting %s --> %s' % (value, extractedUrl))
    return claim


def extractQuantity(page, item, claim, value):
    """
    Extract a quantity from ``value`` and add it to ``item`` through the API.

    - page: page the value comes from; page.site.language() selects how
      separators are read
    - item: item to edit; it is re-created from its title on ``repo``
    - claim: only its property id (getID()) is used
    - value: raw text taken from a template field

    Unlike the other extractors, this one does not fill ``claim``: it sends
    a wbcreateclaim request straight away, so overwriting does not work
    for quantities.

    Returns the last claim of that property found on the item after the
    request, or None when no quantity is found in ``value``.
    """
    item = pywikibot.ItemPage(repo, item.title())
    languageEdition = page.site.language()
    regex = r'(?:^|[\s\-–~>:約])((?:(?<!\d)-?)\d[\d,\.\'\s]*)(\smillion)?(?:[\s<+/\}人]|$)'
    match = re.search(regex, value, re.UNICODE)
    if match is None:
        pywikibot.output(value)
        pywikibot.output("No valid quantity found")
        return
    if languageEdition != 'en' and 'formatnum' not in value:
        # comma separates decimal, point ignored
        # NOTE(review): turning "," into "." and then removing every "."
        # drops both separators, so "3,5" becomes "35". The comment above
        # suggests the comma was meant to survive as the decimal point, but
        # grouped integers such as "1,234" currently depend on this -
        # confirm the intended behaviour before changing it.
        number = match.group(1).replace(",", ".")
        number = number.replace(".", "")
    else:
        # point separates decimal, comma ignored
        number = match.group(1).replace(",", "")
    number = number.replace("'", "")
    # the pattern allows any whitespace inside the number, so drop all of
    # it rather than plain spaces only
    number = "".join(number.split())
    if match.group(2):
        # round before int(): int() alone truncates, which can lose one
        # unit when the float product falls just below a whole number
        number = str(int(round(float(number) * 1000000)))
    if not number.startswith("-"):
        number = "+" + number
    pywikibot.output('Extracting %s --> %s' % (value, number))
    quantity = {"amount": number,
                "unit": "1",
                "upperBound": number,
                "lowerBound": number
                }
    params = {'action': 'wbcreateclaim',
              'entity': item.title(),
              'property': claim.getID(),
              'snaktype': 'value',
              'value': json.dumps(quantity),
              'bot': 1,
              'token': repo.token(item, 'edit')
              }
    req = pywikibot.data.api.Request(site=repo, **params)
    pywikibot.output('Adding %s --> %s' % (claim.getID(), quantity))
    req.submit()
    # reload the item so that the new claim shows up in item.claims
    item.get()
    return item.claims[claim.getID()][-1]


def matchDatatype(pid, value, page, item):
    """
    Create a claim for property ``pid`` and fill it with the extractor that
    matches the property's datatype.

    - pid: property id ('P731')
    - value: raw text taken from a template field
    - page: page the value comes from
    - item: item being edited

    Returns whatever the chosen extractor returns, or None when the
    datatype is not supported.
    """
    claim = pywikibot.Claim(repo, pid)
    datatype = claim.getType()
    if datatype == 'wikibase-item':
        return extractItem(claim, value, page, item)
    if datatype == 'string':
        return extractString(claim, value)
    if datatype == 'url':
        return extractUrl(value, claim)
    if datatype == 'time':
        return extractTime(page, value, claim)
    if datatype == 'commonsMedia':
        return extractMedia(claim, value)
    if datatype == 'quantity':
        return extractQuantity(page, item, claim, value)
    pywikibot.output("%s is not a supported datatype." % datatype)
    return