#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Usage:
python harvest_template.py -lang:nl -template:"Taxobox straalvinnige" orde P70 familie P71 geslacht P74
This will work on all pages that transclude the template in the article
namespace
You can use any typical pagegenerator to provide a list of pages:
python harvest_template.py -lang:nl -cat:Sisoridae -template:"Taxobox straalvinnige" -namespace:0 orde P70 familie P71 geslacht P74
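
The -overwrite option also adds claims for properties that already have a
value on the item (by default such properties are skipped):
python harvest_template.py -lang:nl -template:"Taxobox straalvinnige" -overwrite orde P70 familie P71 geslacht P74
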
&params;
"""
#
# (C) Multichill, Amir, 2013
# (C) Pywikibot team, 2013
#
# Distributed under the terms of MIT License.
#
from __future__ import unicode_literals

__version__ = '$Id: 2507544f311b7164e04c7c83198a891f33e9f8ee $'
#
import re
import json
from datetime import timedelta

import pywikibot
import pywikibot.date
from pywikibot import pagegenerators as pg
docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
class HarvestRobot:
"""
A bot to add Wikidata claims
"""
def __init__(self, generator, templateTitle, fields, overwrite=False):
"""
Arguments:
* generator - A generator that yields Page objects.
* templateTitle - The template to work on
* fields - A dictionary of fields that are of use to us
* overwrite - whether to add claims for properties that already have a value
"""
self.generator = generator
self.templateTitle = templateTitle.replace(u'_', u' ')
# TODO: Make it a list which also includes the redirects to the template
self.fields = fields
self.overwrite = overwrite
self.repo = pywikibot.Site().data_repository()
self.cacheSources()
def getSource(self, site):
"""
Get the source for the specified site,
if possible
"""
if site.family.name in self.source_values and site.code in self.source_values[site.family.name]:
source = pywikibot.Claim(self.repo, 'P143')
source.setTarget(self.source_values.get(site.family.name).get(site.code))
return source
def cacheSources(self):
"""
Fetches the source items from the on-wiki list
and stores them internally
"""
page = pywikibot.Page(self.repo, u'List of wikis/python', ns=4)
self.source_values = json.loads(page.get())
for family_code, family in self.source_values.items():
for source_lang in family:
self.source_values[family_code][source_lang] = pywikibot.ItemPage(self.repo,
family[source_lang])
def run(self):
"""
Starts the robot.
"""
self.templateTitles = self.getTemplateSynonyms(self.templateTitle)
for i, page in enumerate(self.generator):
try:
self.processPage(i, page)
except Exception as e:
pywikibot.exception(tb=True)
def newItem(self, page, item):
"""
Create item where none exists (from newitem.py by Multichill)
"""
self.pageAge = 21
self.pageAgeBefore = self.repo.getcurrenttime() - timedelta(days=self.pageAge)
self.lastEdit = 7
self.lastEditBefore = self.repo.getcurrenttime() - timedelta(days=self.lastEdit)
if page.isRedirectPage():
pywikibot.output('%s is a redirect page. Skipping.' % page)
elif page.namespace() == 2:
pywikibot.output('%s is a user page. Skipping.' % page)
elif page.editTime() > self.lastEditBefore:
pywikibot.output('Last edit on %s was on %s. Too recent. Skipping.' % (page, page.editTime().isoformat()))
else:
(revId, revTimestamp, revUser, revComment) = page.getVersionHistory(reverseOrder=True, total=1)[0]
if revTimestamp > self.pageAgeBefore:
pywikibot.output('Page creation of %s on %s is too recent. Skipping.' % (page, revTimestamp.isoformat()))
elif page.langlinks():
# FIXME: Implement this
pywikibot.output('Found language links (interwiki links). Haven\'t implemented that yet so skipping.')
else:
# FIXME: i18n
summary = u'Bot: New item with sitelink from %s' % (page.title(asLink=True, insite=self.repo), )
data = {'sitelinks':
{item.getdbName(page.site):
{'site': item.getdbName(page.site),
'title': page.title()}
},
'labels':
{page.site.lang:
{'language': page.site.lang,
'value': page.title()}
}
}
pywikibot.output(summary)
item.editEntity(data, summary=summary)
def getTemplateSynonyms(self, title):
"""
Fetches redirects of the title, so we can check against them
"""
pywikibot.output('Finding redirects...') # Put some output here since it can take a while
temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
if temp.isRedirectPage():
temp = temp.getRedirectTarget()
titles = [page.title(withNamespace=False)
for page
in temp.getReferences(redirectsOnly=True, namespaces=[10], follow_redirects=False)]
titles.append(temp.title(withNamespace=False))
return titles
def extractUrl(self, page, value, claim):
"""
Extract url datatype from field
"""
match = re.search("(http[^|\s\]\}]+)", value) # try format {{URL|htp://www.example.com}} or [http://www.example.com Example]
if match is None:
match = re.search("{{\w*\|(www\S+)}}", value) # try format {{URL|www.example.com}}
if match is None:
match = re.search("{{\w*\|(\S+)}}", value) # try format {{URL|example.com}}
extractedUrl = "http://www." + match.group(1)
else:
extractedUrl = "http://" + match.group(1)
else:
extractedUrl = match.group(1)
claim.setTarget(extractedUrl)
pywikibot.output('Extracting %s --> %s' % (value, extractedUrl))
def monthToNumber(self, page, month):
"""
Returns an integer from the month name
"""
# return if the month is already int
try:
month = int(month)
return month
except ValueError:
pass
# try formats from date.py
month_dictio = {}
languageEdition = page.site.language()
getMonthInt = pywikibot.date.getAutoFormat
if getMonthInt(languageEdition, month)[1]:
month = getMonthInt(languageEdition, month)[1]
else:
if languageEdition == "cs": # Česky (Czech)
month_dictio = {"ledna": "1", "února": "2", "března": "3", "dubna": "4",
"května": "5", "června": "6", "července": "7", "srpna": "8",
"října": "10", "listopadu": "11", "prosince": "12"}
elif languageEdition == "fr": # Français
month_dictio = {"mars": "3"}
elif languageEdition == "ru": # Russian
month_dictio = {"января": "1", "февраля": "2", "марта": "3", "апреля": "4",
"мая": "5", "июня": "6", "июля": "7", "августа": "8",
"сентября": "9", "октября": "10", "ноября": "11", "декабря": "12"}
for monthName, monthNumber in month_dictio.items():
if month == monthName:
month = month.replace(monthName, monthNumber)
break
try:
month = int(month)
except ValueError:
pywikibot.output('%s is not a valid month' % month)
month = None
return month
def extractTime(self, page, value, claim):
"""
Extract time from field
"""
extractedMonth = None
extractedDay = None
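# Map each date regex to the order in which its capture groups hold the
# date components; the first pattern that matches the field value wins.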
regexDict = {"\{\{(?:[dD]ate|[fF]echa|[dD]ni|[fF]alec).*?\|(\d{1,2})\|(\w*)\|(\d{1,4})": "ddMMyyyy", # templates, format 1 January 2000
"(\d{1,2})\.?\s(\w*)\]{0,2},?\s\[{0,2}(\d{1,4})": "ddMMyyyy", # format 1 January 2000
"(\d{1,2})[\.\/](\w*)[\.\/](\d{3,4})": "ddMMyyyy", # format 01.01.2000
"(\d{3,4})[\.\/-](\w*)[\.\/-](\d{1,2})": "yyyymmdd", # format 2000-01-01
"(\d{1,2})\sde\s(\w*)\]{0,2}\sde\s\[{0,2}(\d{1,4})": "ddMMyyyy", # format [[1 de enero]] de [[2000]]
"(\w*)\.?\s(\d{1,2}),\s(\d{1,4})": "MMddyyyy", # format January 1, 2000
"\{\{\w(?:irth|eath|tart|ilm).*?(\d{1,4})[\|\}](?:\w*=)?(?:\d{3,4}|(\d{0,2}))[\|\}](?:\w*=)?(\d{0,2})": "yyyymmdd", # English templates, format 2000 01 01
"([\d,]+)(?:\s*| )(?:BC|bc|av. J)": "yyyy BC"} # years BC, format 1000 BCE
for regex, timeFormat in regexDict.items():
match = re.search(regex, value, re.UNICODE)
if match is not None:
if timeFormat == "ddMMyyyy": # day, month name, year
extractedMonth = self.monthToNumber(page, match.group(2))
if extractedMonth is None:
match = None
continue
else:
extractedYear = int(match.group(3))
extractedDay = int(match.group(1))
break
elif timeFormat == "MMddyyyy": # month name, day, year
extractedMonth = self.monthToNumber(page, match.group(1))
if extractedMonth is None:
match = None
continue
else:
extractedYear = int(match.group(3))
extractedDay = int(match.group(2))
break
elif timeFormat == "yyyymmdd": # year, month, day
if match.group(2):
extractedMonth = int(match.group(2))
if match.group(3):
extractedDay = int(match.group(3))
extractedYear = int(match.group(1))
break
elif timeFormat == "yyyy BC": # year BCE
extractedYear = "-" + match.group(1).replace(",", "") # remove commas, if any
break
if match is None:
match = re.search("^\[{0,2}(-?\d{1,4})(?!\d*(st|nd|rd|th))", value) # last resort
extractedYear = match.group(1)
timeclaim = pywikibot.WbTime(year=extractedYear, month=extractedMonth, day=extractedDay)
claim.setTarget(timeclaim)
pywikibot.output('Extracting %s --> %s-%s-%s' % (value, extractedYear, extractedMonth, extractedDay))
def processPage(self, index, page):
"""
Process a single page
"""
item = pywikibot.ItemPage.fromPage(page)
pywikibot.output(u'Processing No. %s: %s' % (index, page))
if not item.exists():
# create the page
self.newItem(page, item)
item = pywikibot.ItemPage.fromPage(page)
if not item.exists():
# The item was not created
return
pagetext = page.get()
templates = pywikibot.extract_templates_and_params(pagetext)
for (template, fielddict) in templates:
# Clean up template
template = pywikibot.Page(page.site, template,
ns=10).title(withNamespace=False)
# We found the template we were looking for
if template in self.templateTitles:
for field, value in fielddict.items():
field = field.strip()
value = value.strip()
# This field contains something useful for us
if field in self.fields:
claim = pywikibot.Claim(self.repo, self.fields[field])
# Check if the property isn't already set
if self.overwrite is False:
if claim.getID() in item.get().get('claims'):
pywikibot.output(
u'%s already exists (-overwrite to change it)'
% claim.getID())
continue
# TODO FIXME: This is a very crude way of dupe
# checking
if str(item.getVersionHistory(total=50)).find("wbremoveclaims-remove:1| */ [[Property:" + claim.getID()) != -1:
pywikibot.output('%s cannot be added as it was recently removed from the item' % (claim.getID(),))
else:
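# Collect candidate values: wikilink targets (except File:/Image: links)
# and the first parameter of {{flag...}} templates (but not {{flagicon}}).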
match = re.findall("\[\[(?!(?:File:|Image:))([^|]*?)(?:\|.*?)?\]\]", value)
match2 = re.findall("\{\{[fF]lag(?!icon).*?\|([^|]*?)(?:\|.*?)?\}\}", value)
for additional in match2:
match.insert(0, additional)
for value in match:
value = "[[" + value + "]]"
if claim.getType() == 'wikibase-item':
# Try to extract a valid page
match = re.search(pywikibot.link_regex, value)
if match is None:
pywikibot.output('No valid item found for %s' % (claim.getID(),))
continue
else:
try:
link = pywikibot.Link(match.group(1))
linkedPage = pywikibot.Page(link)
if linkedPage.isRedirectPage():
linkedPage = linkedPage.getRedirectTarget()
linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
claim.setTarget(linkedItem)
linkedItem.title() # forces the item data to load; raises NoPage if the item does not exist
if linkedPage.isDisambig(): # avoid adding disambiguation pages
pywikibot.output('%s is a disambiguation page. Skipping.' % (linkedPage,))
continue
except pywikibot.exceptions.NoPage:
pywikibot.output('%s doesn\'t exist so I can\'t link to it' % (linkedPage,))
continue
elif claim.getType() == 'string':
if value == "":
pywikibot.output('No valid string found for %s' % (claim.getID(),))
continue
else:
match = re.search("^(?!(?:\{|\[http))\W*([^<>]+)(?<![<>])", value, re.UNICODE) # avoid tags, linka and templates
if match:
value = match.group(1)
claim.setTarget(value.strip())
else:
pywikibot.output('%s includes forbidden characters' % (value,))
continue
elif claim.getType() == 'url':
try:
self.extractUrl(page, value, claim)
except AttributeError:
pywikibot.output('No valid URL found for %s' % (claim.getID(),))
continue
elif claim.getType() == 'time':
try:
self.extractTime(page, value, claim)
except AttributeError:
pywikibot.output('No valid time for %s' % (claim.getID(),))
continue
elif claim.getType() == 'commonsMedia':
mediasite = pywikibot.Site("commons", "commons")
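# Accept bare filenames, "File:Name.ext" titles or values wrapped in
# link/template syntax; normalise to a "File:" title and check that the
# file actually exists on Commons before using it as the target.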
if value == "":
pywikibot.output('No valid media file found for %s' % (claim.getID(),))
continue
else:
match = re.search("\w{4,5}:([^|\]]*)", value) # extract filename
if match is not None:
value = "File:" + match.group(1)
elif not value.startswith("File:"):
value = "File:" + value
# check if the image exists on Commons
image = pywikibot.ImagePage(mediasite, value)
if image.exists() is False:
pywikibot.output('%s does not exist on Commons' % (value,))
continue
else:
claim.setTarget(image)
else:
pywikibot.output("%s is not a supported datatype." % claim.getType())
continue
alreadythere = False
try:
for existingclaim in item.get().get('claims')[self.fields[field]]:
if claim.getTarget() == existingclaim.getTarget():
print ("%s already exists with %s" %
(existingclaim.getID(), existingclaim.getTarget()))
alreadythere = True
break
except KeyError:
pass
if alreadythere:
continue
pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget()))
item.addClaim(claim)
# A generator might yield pages from multiple sites
source = self.getSource(page.site)
if source:
claim.addSource(source, bot=True)
def main():
gen = pg.GeneratorFactory()
commandline_arguments = list()
templateTitle = u''
overwrite = False
for arg in pywikibot.handleArgs():
if arg.startswith('-template'):
if len(arg) == 9:
templateTitle = pywikibot.input(
u'Please enter the template to work on:')
else:
templateTitle = arg[10:]
elif arg.startswith('-overwrite'):
overwrite = True
elif gen.handleArg(arg):
continue
else:
commandline_arguments.append(arg)
if len(commandline_arguments) % 2 or not templateTitle:
raise ValueError('Expected a template title and pairs of template field / property id arguments')
fields = dict()
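# The remaining positional arguments come in pairs: a template field name
# followed by the property id it should be written to (e.g. "orde P70").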
for i in range(0, len(commandline_arguments), 2):
fields[commandline_arguments[i]] = commandline_arguments[i + 1]
generator = gen.getCombinedGenerator()
if not generator:
# transcluding generator based on templateTitle
transclusionPage = pywikibot.Page(
pywikibot.Link(
templateTitle, defaultNamespace=10, source=pywikibot.Site()
)
)
generator = pywikibot.Site().page_embeddedin(
transclusionPage, filterRedirects=None,
namespaces=0, step=None, total=None, content=False
)
bot = HarvestRobot(generator, templateTitle, fields, overwrite)
bot.run()
if __name__ == "__main__":
main()