#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
"""
Helper functions to handle values extracted from templates and
add them to a Wikidata claim.
Save this file to the core/pywikibot folder.
Datatypes are explained in
https://www.wikidata.org/wiki/Special:ListDatatypes
The following datatypes are supported:
Item, Commons media, String, Time, URL, Quantity
Globe coordinates are currently NOT supported
"""
import re
import json
import pywikibot
# Data repository of the default pywikibot site, resolved once at import time
# and shared by the helpers below (extractQuantity, matchDatatype).
repo = pywikibot.Site().data_repository()
def extractItem(claim, value, page, item):
    """
    Extract a linked item from a wikilink and set it as the claim target.

    - claim: claim whose target is to be set
    - value: raw template value, searched with pywikibot.link_regex
    - page: page the value came from; its site is used to resolve the link
    - item: item the claim is meant for; a link back to it is rejected

    Returns the claim with its target set. Returns None (after logging the
    reason) when the value contains no link, when NoPage is raised while
    resolving the link, when the linked page is a disambiguation page, or
    when the linked item equals ``item``.
    """
    match = re.search(pywikibot.link_regex, value)
    if match is None:
        pywikibot.output('No valid item found for %s' % (claim.getID(),))
        return

    # Until the page object exists, failures are reported against the raw
    # link text, so the NoPage handler can never hit an unbound name.
    linkedPage = match.group(1)
    try:
        link = pywikibot.Link(match.group(1), page.site)
        linkedPage = pywikibot.Page(link)
        if linkedPage.isRedirectPage():
            linkedPage = linkedPage.getRedirectTarget()
        linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
        # Per the original author's note, title() is called only so that
        # NoPage is raised here; its result is not used.
        linkedItem.title()
        # avoid adding disambiguation pages
        if linkedPage.isDisambig():
            pywikibot.output('%s is a disambiguation page. Skipping.' % (linkedPage,))
            return
        # the object of a claim should not be the same as the item
        if item == linkedItem:
            pywikibot.output('The target and the item are identical, skipping')
            return
        claim.setTarget(linkedItem)
        return claim
    except pywikibot.exceptions.NoPage:
        pywikibot.output('%s doesn\'t exist so I can\'t link to it' % (linkedPage,))
        return
def extractMedia(claim, value):
    """
    Resolve a media file name and set it as the claim target.

    - claim: claim whose target is to be set
    - value: raw template value naming the file

    Returns the claim with its target set, or None (after logging) when the
    value is empty or the file page does not exist on the commons site.
    """
    commons = pywikibot.Site("commons", "commons")
    if value == "":
        pywikibot.output('No valid media file found for %s' % (claim.getID(),))
        return None

    # Keep only what follows a short "prefix:" (up to a pipe or closing
    # bracket) and normalise the name so it always carries "File:".
    prefixed = re.search(r"\w{2,5}:([^|\]]*)", value, re.UNICODE)
    if prefixed is None:
        if not value.startswith("File:"):
            value = "File:" + value
    else:
        value = "File:" + prefixed.group(1)

    # Only an explicit False from exists() counts as "missing".
    mediaPage = pywikibot.ImagePage(commons, value)
    if mediaPage.exists() is not False:
        claim.setTarget(mediaPage)
        return claim

    pywikibot.output('%s does not exist on Commons' % (value,))
    return None
def extractString(claim, value):
    """
    Extract a plain string from a template value and set it as the target.

    - claim: claim whose target is to be set
    - value: raw template value

    Returns the claim with its target set to the stripped text. Returns
    None (after logging) when the value is empty, when it does not match
    the pattern below, or when nothing but whitespace is left after
    stripping.
    """
    if value == "":
        pywikibot.output('No valid string found for %s' % (claim.getID(),))
        return
    # avoid tags, links and templates
    match = re.search(r"^(?!(?:\{|\[http))\W*([^<>]+)(?<![<>])", value, re.UNICODE)
    if not match:
        pywikibot.output('%s includes forbidden characters' % (value,))
        return
    text = match.group(1).strip()
    # The pattern can capture a lone whitespace character (e.g. for a value
    # of " "); never set an empty string as the claim target.
    if not text:
        pywikibot.output('No valid string found for %s' % (claim.getID(),))
        return
    claim.setTarget(text)
    return claim
def extractTime(page, value, claim):
    """
    Extract a date from a template value and set it as the claim target.

    - page: page the value came from, passed on to monthToNumber
    - value: raw template value
    - claim: claim whose target is to be set

    Each pattern in ``regexDict`` is tried until one yields a usable date;
    if none does, a leading 1-4 digit number is taken as a bare year.
    Returns the claim with a pywikibot.WbTime target, or None (after
    logging) when no date could be extracted. The year is always passed to
    WbTime as an int; month and day are ints or None.
    """
    extractedMonth = None
    extractedDay = None
    # NOTE(review): patterns are tried in dict iteration order, which is not
    # the written order on every Python version -- confirm whether a fixed
    # priority is intended before relying on it.
    regexDict = {
        # templates, format 1 January 2000
        r"\{\{(?:[dD]ate|[fF]echa|[dD]ni|[fF]alec).*?\|(\d{1,2})\|(\w*)\|(\d{1,4})": "ddMMyyyy",
        # format 1 January 2000
        r"(?:(\d{1,2})[\.º]?\s)?(\w*)\]{0,2},?\s\[{0,2}(\d{1,4})": "ddMMyyyy",
        # format 01.01.2000
        r"(\d{1,2})[\.\/](\w*)[\.\/](\d{3,4})": "ddMMyyyy",
        # format 2000-01-01
        r"^(?:(?!access).)*?(\d{3,4})[\.\/-](\w*)[\.\/-](\d{1,2})": "yyyymmdd",
        # format [[1 de enero]] de [[2000]]
        r"(\d{1,2})\sde\s(\w*)\]{0,2}\sde\s\[{0,2}(\d{1,4})": "ddMMyyyy",
        # format January 1, 2000
        r"(\w*)\.?\s(\d{1,2}),\s(\d{1,4})": "MMddyyyy",
        # English templates, format 2000 01 01
        r"\{\{\w(?:irth|eath|tart|ilm).*?(\d{1,4})[\|\}](?:\w*=)?(?:\d{3,4}|(\d{0,2}))[\|\}](?:\w*=)?(\d{0,2})": "yyyymmdd",
        # years BC, format 1000 BCE
        r"([\d,]+)(?:\s*| )(?:BC|bc|av. J|a.C.)": "yyyy BC"
    }
    for regex, timeFormat in regexDict.items():
        match = re.search(regex, value, re.UNICODE)
        if match is None:
            continue
        if timeFormat == "ddMMyyyy":  # day, month name, year
            extractedMonth = monthToNumber(page, match.group(2))
            if extractedMonth is None:
                # unusable month: discard this match and try the next pattern
                match = None
                continue
            extractedYear = int(match.group(3))
            extractedDay = match.group(1)
            if extractedDay is not None:
                extractedDay = int(extractedDay)
            break
        elif timeFormat == "MMddyyyy":  # month name, day, year
            extractedMonth = monthToNumber(page, match.group(1))
            if extractedMonth is None:
                match = None
                continue
            extractedYear = int(match.group(3))
            extractedDay = int(match.group(2))
            break
        elif timeFormat == "yyyymmdd":  # year, month, day
            # The month group is (\w*) in one pattern, so it may hold
            # non-numeric text; treat that as "no match" instead of letting
            # int() raise.
            try:
                numericMonth = int(match.group(2)) if match.group(2) else None
                numericDay = int(match.group(3)) if match.group(3) else None
                numericYear = int(match.group(1))
            except ValueError:
                match = None
                continue
            extractedMonth = numericMonth
            extractedDay = numericDay
            extractedYear = numericYear
            break
        elif timeFormat == "yyyy BC":  # year BCE
            digits = match.group(1).replace(",", "")  # remove commas, if any
            if not digits:
                # [\d,]+ matched commas only, so there is no year here
                match = None
                continue
            extractedYear = -int(digits)
            break
    if match is None:
        match = re.search(r"^\[{0,2}(-?\d{1,4})(?!\d*(st|nd|rd|th))", value)  # last resort
        if match is None:
            pywikibot.output('No valid time for %s' % (claim.getID(),))
            return
        extractedYear = int(match.group(1))
    timeclaim = pywikibot.WbTime(year=extractedYear, month=extractedMonth, day=extractedDay)
    claim.setTarget(timeclaim)
    pywikibot.output('Extracting %s --> %s-%s-%s' % (value, extractedYear, extractedMonth, extractedDay))
    return claim
def monthToNumber(page, month):
    """
    Return the month as an integer, given a number or a month name.

    - page: page the value came from; its site language selects the names
    - month: month number (int or numeric string) or a month name

    A value that int() accepts is returned directly. Otherwise the name is
    looked up through pywikibot.date.getAutoFormat and, failing that, in
    the small per-language tables below (inflected forms for cs and ru,
    "mars" for fr). Returns None (after logging) when the name cannot be
    resolved.
    """
    # return if the month is already int
    try:
        return int(month)
    except ValueError:
        pass

    # try formats from date.py
    languageEdition = page.site.language()
    autoValue = pywikibot.date.getAutoFormat(languageEdition, month)[1]
    if autoValue:
        month = autoValue
    else:
        fallbackNames = {
            # Česky (Czech)
            "cs": {"ledna": "1", "února": "2", "března": "3", "dubna": "4",
                   "května": "5", "června": "6", "července": "7", "srpna": "8",
                   "září": "9", "října": "10", "listopadu": "11",
                   "prosince": "12"},
            # Français
            "fr": {"mars": "3"},
            # Russian
            "ru": {"января": "1", "февраля": "2", "марта": "3", "апреля": "4",
                   "мая": "5", "июня": "6", "июля": "7", "августа": "8",
                   "сентября": "9", "октября": "10", "ноября": "11",
                   "декабря": "12"},
        }
        # Unknown languages and unknown names leave the value untouched so
        # the conversion below reports it.
        month = fallbackNames.get(languageEdition, {}).get(month, month)

    try:
        return int(month)
    except ValueError:
        pywikibot.output('%s is not a valid month' % (month,))
        return None
def extractUrl(value, claim):
    """
    Extract a URL from a template value and set it as the claim target.

    - value: raw template value
    - claim: claim whose target is to be set

    Returns the claim with its target set, or None (after logging) when
    none of the patterns below matches.
    """
    # (pattern, prefix) pairs in order of preference; a prefix of None means
    # the captured text is used unchanged.
    attempts = (
        # format {{URL|http://www.example.com}} or [http://www.example.com Example]
        (r"(http[^|\s\]\}]+)", None),
        # format {{URL|www.example.com}}
        (r"{{\w*\|(www\S+)}}", "http://"),
        # format {{URL|example.com}}
        (r"{{\w*\|(\S+)}}", "http://www."),
    )
    for pattern, prefix in attempts:
        found = re.search(pattern, value)
        if found is None:
            continue
        captured = found.group(1)
        extractedUrl = captured if prefix is None else prefix + captured
        break
    else:
        pywikibot.output('No valid URL found for %s' % (claim.getID(),))
        return None

    claim.setTarget(extractedUrl)
    pywikibot.output('Extracting %s --> %s' % (value, extractedUrl))
    return claim
def extractQuantity(page, item, claim, value):
    """
    Some basic support for quantities.

    The data is added directly through the API (wbcreateclaim), so
    overwriting will not work for quantity claims.

    - page: page the value came from; its site language selects the
      separator handling
    - item: item the claim is added to
    - claim: claim supplying the property id
    - value: raw template value

    Returns the last claim stored on the item for this property after the
    API call, or None (after logging) when no number is found in the value.
    """
    item = pywikibot.ItemPage(repo, item.title())
    languageEdition = page.site.language()
    regex = r'(?:^|[\s\-–~>:約])((?:(?<!\d)-?)\d[\d,\.\'\s]*)(\smillion)?(?:[\s<+/\}人]|$)'
    match = re.search(regex, value, re.UNICODE)
    if match is None:
        pywikibot.output(value)
        pywikibot.output("No valid quantity found")
        return
    # The pattern captures any whitespace (tabs, newlines, non-breaking
    # spaces), not just plain spaces, so drop all of it.
    number = "".join(match.group(1).split())
    if languageEdition != 'en' and 'formatnum' not in value:
        # NOTE(review): the original comment here read "comma separates
        # decimal, point ignored", but the code has always removed both
        # separators; that behaviour is kept -- confirm which is intended.
        number = number.replace(",", "").replace(".", "")
    else:
        # point separates decimal, comma ignored
        number = number.replace(",", "")
    number = number.replace("'", "")
    # A sentence-ending "." may have been captured with the digits; it
    # carries no value and must not reach the amount.
    number = number.rstrip(".")
    if match.group(2):
        number = str(int((float(number) * 1000000)))
    if not number.startswith("-"):
        number = "+" + number
    pywikibot.output('Extracting %s --> %s' % (value, number))
    quantity = {"amount": number,
                "unit": "1",
                "upperBound": number,
                "lowerBound": number
                }
    params = {'action': 'wbcreateclaim',
              'entity': item.title(),
              'property': claim.getID(),
              'snaktype': 'value',
              'value': json.dumps(quantity),
              'bot': 1,
              'token': repo.token(item, 'edit')
              }
    req = pywikibot.data.api.Request(site=repo, **params)
    pywikibot.output('Adding %s --> %s' % (claim.getID(), quantity))
    req.submit()
    # Reload the item so the claim that was just created is visible.
    item.get()
    return item.claims[claim.getID()][-1]
def matchDatatype(pid, value, page, item):
    """
    Build a claim for a property and fill it via the matching extractor.

    - pid: property id ('P731')
    - value: raw template value handed to the extractor
    - page: page the value came from
    - item: item the claim is meant for

    Returns whatever the selected extractor returns, or None (after
    logging) when the property's datatype has no extractor.
    """
    claim = pywikibot.Claim(repo, pid)
    datatype = claim.getType()

    # One zero-argument callable per supported datatype; each extractor
    # keeps its own argument order.
    extractors = {
        'wikibase-item': lambda: extractItem(claim, value, page, item),
        'string': lambda: extractString(claim, value),
        'url': lambda: extractUrl(value, claim),
        'time': lambda: extractTime(page, value, claim),
        'commonsMedia': lambda: extractMedia(claim, value),
        'quantity': lambda: extractQuantity(page, item, claim, value),
    }
    extractor = extractors.get(datatype)
    if extractor is None:
        pywikibot.output("%s is not a supported datatype." % datatype)
        return None
    return extractor()