# User:MedalBot/medalbot.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
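This script adds medals to Wikidata.

It reads the medal templates used on Wikipedia articles and adds matching
P166 claims, qualified with P641 and P793, to the articles' Wikidata items.

Options:
    -event:<title>  title of the event page; pages referring to it are
                    processed unless page generator options are given

The standard pywikibot page generator options are also accepted.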
"""
#
# (C) John Vandenberg, 2014
#
# Distributed under the terms of the MIT license.

import pywikibot
from pywikibot import pagegenerators

import pickle
import time
import copy

def getPageTemplate(templates, name):
    """
    Return the parameters of the first template in templates called name.

    Arguments:
        * templates - iterable of (template page, params) pairs, e.g. the
          result of Page.templatesWithParams()
        * name - template title to look for; when it contains a ':' it is
          compared with the full template title, otherwise with the part
          of the title following the first ':'

    Returns the params of the first matching template, or None when no
    template matches.
    """
    for (inPageTemplate, params) in templates:
        title = inPageTemplate.title()
        if ':' in name:
            candidate = title
        else:
            # A title without any ':' used to raise IndexError here; it is
            # now compared as is.
            parts = title.split(':')
            candidate = parts[1] if len(parts) > 1 else parts[0]
        if candidate == name:
            return params
    return None

def getTemplateList(templates):
    """
    Return the distinct template titles found in templates.

    Arguments:
        * templates - iterable of (template page, params) pairs

    The titles are listed in the order in which they first appear.
    """
    titles = []
    for (template_page, _params) in templates:
        title = template_page.title()
        if title in titles:
            continue
        titles.append(title)
    return titles

def printTemplateList(templates):
    """
    Output the distinct titles of the given templates on one line.

    Arguments:
        * templates - list of (template page, params) pairs

    Prints 'No templates' when the list is empty.
    """
    if not templates:
        # Parenthesised form: identical output on Python 2, valid on Python 3.
        print('No templates')
        return

    template_names = getTemplateList(templates)
    pywikibot.output(u"Templates: %s" % u','.join(template_names))

def getInfobox(templates, infobox_type=None):
    """
    Return the parameters of the first infobox template in templates.

    Arguments:
        * templates - iterable of (template page, params) pairs
        * infobox_type - optional infobox type (compared with the
          lower-cased template title); when omitted, the first template
          whose title looks like an infobox is used and the type is
          derived from that title

    Returns a new list holding the template's params followed by an
    'infobox_type=<type>' entry, or None when no infobox is found. The
    params list passed in is left untouched.
    """
    if infobox_type:
        print("getting %s" % infobox_type)

    for (inPageTemplate, params) in templates:
        #print params
        title = inPageTemplate.title().lower()
        page_infobox_type = None

        if infobox_type:
            markers = (':infobox ' + infobox_type,
                       ':info/' + infobox_type,
                       ':ficha de ' + infobox_type,
                       ':' + infobox_type)
            if any(marker in title for marker in markers):
                page_infobox_type = infobox_type
        else:
            parts = title.split(':')
            # Titles without a namespace prefix used to raise IndexError.
            local_name = parts[1] if len(parts) > 1 else parts[0]
            if ':infobox ' in title:
                page_infobox_type = local_name[len('infobox '):]
            elif ':ficha de ' in title:
                page_infobox_type = local_name[len('ficha de '):]
            elif ':info/' in title:  # pt
                page_infobox_type = local_name[len('info/'):]
            elif title.endswith('infobox'):  # pl
                # Drops 'infobox' plus the one character before it
                # (presumably the separating space).
                page_infobox_type = local_name[0:-8]

        if page_infobox_type:
            # Build a new list instead of appending to the caller's params,
            # so repeated calls cannot accumulate duplicate entries.
            return list(params) + ['infobox_type=' + page_infobox_type]

    return None

def getMedalTemplates(page):
    """
    Return the medal templates used on page.

    Arguments:
        * page - object providing templatesWithParams()

    Returns the (template page, params) pairs whose template title is one
    of the known medal templates, in page order.
    """
    # Titles are compared including their namespace prefix, so every entry
    # needs the 'Template:' prefix ('MedalEuropeanChampionships' lacked it
    # and therefore never matched).
    known_templates = frozenset([
        'Template:MedalSport', 'Template:MedalCompetition',
        'Template:MedalCountry',
        'Template:MedalGold', 'Template:MedalSilver', 'Template:MedalBronze',
        'Template:Medal', 'Template:MedalEuropeanChampionships',
        'Template:MedalOlympic', 'Template:MedalWorldChampionships',
    ])
    return [template for template in page.templatesWithParams()
            if template[0].title() in known_templates]

def getLinkFromWikitext(text):
    """
    Build a pywikibot Link from a piece of wikitext.

    Arguments:
        * text - wikitext of a link, with or without the surrounding
          [[ ]] brackets

    Returns the Link, or None (after reporting it) when the Link could not
    be created.
    """
    link_text = text
    if '[[' in link_text:
        link_text = link_text.replace('[[', '').replace(']]', '')
    try:
        return pywikibot.page.Link(link_text)
    except Exception:
        # Not a bare except: KeyboardInterrupt and SystemExit must still
        # be able to stop the bot.
        pywikibot.output(u"Failed to load link from text: %s" % text)
        return None

def getPageFromLink(link):
    """
    Return the page a Link points to, following a redirect if there is one.

    Arguments:
        * link - pywikibot Link

    Errors are reported and not raised. The result is None when the page
    could not be created; when a later step fails, the page obtained so
    far is returned.
    """
    page = None
    try:
        page = pywikibot.Page(link)
        if page.isRedirectPage():
            page = page.getRedirectTarget()
    except Exception as e:
        # 'except ... as' and print() are valid on Python 2.6+ and 3.
        print(e)
        pywikibot.output(u"Failed to load page from link: %s" % link.astext())
    return page

def getPageFromWikitext(text):
    """
    Return the page referenced by a wikitext link.

    Arguments:
        * text - wikitext of the link

    Returns None when no link could be built from the text.
    """
    link = getLinkFromWikitext(text)
    if not link:
        return None
    return getPageFromLink(link)

def getItemIDFromPage(page):
    """
    Return the numeric part of the id of the Wikidata item linked to page.

    Arguments:
        * page - pywikibot Page

    Returns the item title without its leading letter, as a string, or
    None (after reporting it) when the item does not exist.
    """
    item = pywikibot.ItemPage.fromPage(page)
    if item.exists():
        return item.title()[1:]

    # Page titles may contain non-ASCII characters; pywikibot.output is
    # what the rest of this file uses for such messages, and a plain print
    # can fail with an encoding error on some terminals.
    pywikibot.output(u'Item doesnt exist for page %s' % page.title())
    return None

def getItemIDFromLink(link):
    """
    Return the Wikidata item id for the page a Link points to.

    Arguments:
        * link - pywikibot Link

    Returns None when the page could not be loaded.
    """
    target = getPageFromLink(link)
    return getItemIDFromPage(target) if target else None

def getItemIDFromWikitext(text):
    """
    Return the Wikidata item id for the page referenced by a wikitext link.

    Arguments:
        * text - wikitext of the link

    Returns None when no page could be resolved from the text.
    """
    target = getPageFromWikitext(text)
    if not target:
        return None
    return getItemIDFromPage(target)

def getMedalData(page):
    """
    Collect the medals described by the medal templates on page.

    Arguments:
        * page - pywikibot Page

    Returns {'medals': [...]} with one dict per MedalGold / MedalSilver /
    MedalBronze template, in page order. Each medal dict holds:
        * 'type' - 'gold', 'silver' or 'bronze'
        * 'at' - item id resolved from the template's first parameter
          (None when it could not be resolved)
        * 'sport' - item id from the most recent preceding MedalSport
          template, if any
        * 'for' / 'for_title' - item id and link title from the template's
          second parameter, when present and resolvable
    """
    medalbox = {'medals': []}
    # Values set by MedalSport / MedalCompetition apply to the medal
    # templates that follow them.
    context = {}
    medal_templates = getMedalTemplates(page)
    print(medal_templates)

    for (inPageTemplate, params) in medal_templates:
        template = inPageTemplate.title()
        # Templates used without parameters used to raise IndexError.
        first_param = params[0] if params else None

        if template == 'Template:MedalSport':
            if first_param is None:
                pywikibot.output(u'%s has no parameters; skipping' % template)
                continue
            pywikibot.output(u'found MedalSport: %s' % first_param)
            qid = getItemIDFromWikitext(first_param)
            if qid:
                context['sport'] = qid
        elif template == 'Template:MedalCompetition':
            if first_param is None:
                pywikibot.output(u'%s has no parameters; skipping' % template)
                continue
            pywikibot.output(u'found MedalCompetition: %s' % first_param)
            qid = getItemIDFromWikitext(first_param)
            if qid:
                context['at'] = qid
        elif template in ['Template:MedalGold', 'Template:MedalSilver', 'Template:MedalBronze']:
            print('found ' + template)
            medal_type = template[len('Template:Medal'):].lower()

            competition_qid = None
            if first_param is not None:
                competition_qid = getItemIDFromWikitext(first_param)

            medal_for_qid = None
            medal_for_link = None
            if len(params) > 1 and params[1]:
                medal_for_link = getLinkFromWikitext(params[1])
                if medal_for_link:
                    medal_for_qid = getItemIDFromLink(medal_for_link)

            # Start from a copy of the context so that 'for' / 'for_title'
            # of one medal cannot leak into the next one.
            medal = dict(context)
            medal['type'] = medal_type
            # The medal's own first parameter always takes precedence over
            # a preceding MedalCompetition value, even when unresolved.
            medal['at'] = competition_qid

            if medal_for_qid:
                medal['for'] = medal_for_qid
            if medal_for_link:
                medal['for_title'] = medal_for_link.title

            medalbox['medals'].append(medal)

    return medalbox

def getInfoboxField(infobox, field):
    """
    Return the value of a 'field=value' entry from a list of parameters.

    Arguments:
        * infobox - list of template parameters; entries that are not
          strings are ignored
        * field - field name, matched case-insensitively

    Returns the text after '<field>=' of the first matching entry, or None
    when infobox is empty or no entry matches.
    """
    #print infobox
    if not infobox:
        return None

    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # Python 3

    field_start = field.lower() + u'='
    for param in infobox:
        if isinstance(param, string_types) and param.lower().startswith(field_start):
            return param[len(field_start):]
    return None


class EventDatabase:
    """
    Container for the data collected about an event.

    The collected data can be stored to and restored from a pickle file
    through the db attribute.
    """

    # Class-level defaults, kept for backward compatibility; __init__ gives
    # every instance its own lists so that instances do not share state.
    # NOTE(review): only medalists is filled elsewhere in this file; the
    # intended content of the other lists is not visible here.
    events = []
    countries = []
    other_participants = []
    medalists = []
    junk_pages = []
    unknown_page = []
    db = None

    def __init__(self):
        self.events = []
        self.countries = []
        self.other_participants = []
        self.medalists = []
        self.junk_pages = []
        self.unknown_page = []

    def load(self, filename):
        """
        Load db from a pickle file.

        Arguments:
            * filename - path of the file, or an open binary file object

        Only load files from a trusted source: unpickling can execute
        arbitrary code.
        """
        if hasattr(filename, 'read'):
            self.db = pickle.load(filename)
            return
        # pickle.load needs a file object, not a file name.
        with open(filename, 'rb') as stream:
            self.db = pickle.load(stream)

    def save(self, filename):
        """
        Save db to a pickle file.

        Arguments:
            * filename - path of the file, or an open binary file object
        """
        if hasattr(filename, 'write'):
            pickle.dump(self.db, filename)
            return
        with open(filename, 'wb') as stream:
            pickle.dump(self.db, stream)

class WikipediaDatabase(EventDatabase):
    """
    Event database filled from the Wikipedia pages related to an event.

    Call event() with the event page first; update() then imports every
    main-namespace page that refers to that page.
    """

    # Page of the event being processed, set by event().
    event_page = None

    # Data repository of the event page's site, set by event().
    datasite = None

    known_types = ['sport', 'person', 'unknown']

    # Medal templates that are expected on a page. Titles are compared
    # including their namespace prefix, so every entry needs 'Template:'
    # ('MedalEuropeanChampionships' lacked it and was always reported as
    # an odd template).
    known_medal_templates = [
        'Template:MedalSport', 'Template:MedalCompetition',
        'Template:MedalCountry',
        'Template:MedalGold', 'Template:MedalSilver', 'Template:MedalBronze',
        'Template:Medal', 'Template:MedalEuropeanChampionships',
        'Template:MedalOlympic', 'Template:MedalWorldChampionships']

    def event(self, page):
        """
        Arguments:
            * event - page of the event to monitor
        """
        self.event_page = page
        self.datasite = page.site.data_repository()

    def source_claim(self):
        """
        Return the (property id, item id) pair used as reference for the
        claims, both without their letter prefix.
        """
        return (143, 328)
        # i.e. imported from English Wikipedia

    def name(self):
        """Return the title of the event page, or None if no event is set."""
        if self.event_page:
            return self.event_page.title()
        return None

    def get_infobox(self, page):
        """
        Return the infobox parameters of page.

        Arguments:
            * page - pywikibot Page

        The templates are fetched and parsed up to two times, sleeping
        three seconds after each failure. Returns the result of
        getInfobox(), or None when both attempts fail.
        """
        for attempt in (1, 2):
            try:
                # Fetching the templates is the step that can fail, so it
                # belongs inside the try. The retry previously passed an
                # undefined name and therefore could never succeed.
                return getInfobox(page.templatesWithParams())
            except Exception as e:
                if attempt == 1:
                    print(e)
                    pywikibot.output(u"Failed to load %s. Sleeping ..." % page.title())
                else:
                    pywikibot.output(u"Failed to load %s again. Sleeping & skipping ..." % page.title())
                time.sleep(3)
        return None

    def import_page(self, page):
        """
        Extract the medal data of page and append it to medalists.

        Arguments:
            * page - pywikibot Page

        The appended entry is a dict with the keys 'qid' (numeric item id,
        or -1 when the page has no item), 'name' (page title) and 'medals'
        (as returned by getMedalData). Unexpected template combinations
        are reported but do not stop the import.
        """
        item = pywikibot.ItemPage.fromPage(page)
        if item.exists():
            item_qid = int(item.title()[1:])
        else:
            # -1 marks a page without an item.
            item_qid = -1

        infobox = self.get_infobox(page)
        page_type = None
        #"Canada's athletes of the 20th century"

        template_names = getTemplateList(page.templatesWithParams())
        medal_data_type = None

        if 'Template:MedalTableTop' in template_names or 'Template:MedalTableTopPic' in template_names:
            medal_data_type = 'table'
            if infobox:
                pywikibot.output(u"%s contains an infobox and a medal table" % page.title())
        elif 'Template:Medal' in template_names:
            medal_data_type = 'medal'
            if not infobox:
                pywikibot.output(u"%s uses {{medal}} without an infobox" % page.title())

        if infobox:
            page_type = getInfoboxField(infobox, 'infobox_type')
            print('---')
            medal_templates = []
            for template_name in template_names:
                if 'Template:Medal' in template_name:
                    medal_data_type = 'infobox'
                    if template_name not in self.known_medal_templates:
                        medal_templates.append(template_name)
            if medal_templates:
                pywikibot.output(u"%s with infobox contains odd medal templates: %s" % (page.title(), u','.join(medal_templates)))
        else:
            page_type = 'unknown'
            if not medal_data_type:
                medal_templates = [template_name
                                   for template_name in template_names
                                   if 'Template:Medal' in template_name]
                if medal_templates:
                    pywikibot.output(u"%s contains odd medal templates without an infobox: %s" % (page.title(), u','.join(medal_templates)))

        pywikibot.output(u"%s(%d) is a %s" % (page.title(), item_qid, page_type))
        #pywikibot.output(u"---")
        #printTemplateList(page.templatesWithParams())
        #pywikibot.output(u"---")

        medaldata = getMedalData(page)
        if medaldata:
            # name should be parsed from the infobox, etc
            name = page.title()
            medalist = {'qid': item_qid, 'name': name, 'medals': medaldata['medals']}
            self.medalists.append(medalist)

    def processWLH(self):
        """Import every main-namespace page that refers to the event page."""
        gen = pagegenerators.ReferringPageGenerator(self.event_page)
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces=[0])

        pywikibot.output(u"Loading pages that refer to %s" % self.event_page.title())
        for page in gen:
            # TODO if page is not in the database, import; otherwise if it has changed, update
            self.import_page(page)
        pywikibot.output(u"Finished loading pages that refer to %s" % self.event_page.title())

    def update(self):
        """Refresh the database from the pages referring to the event page."""
        self.processWLH()

class MedalRobot:
    """
    A bot to add medals to Wikidata
    """
    source_snak = None

    competition_items = {}

    def __init__(self, database, datasite):
        """
        Arguments:
            * event - page of the event to monitor
        """
        self.database = database
        self.datasite = datasite

    def getCompetitionLabel(self,qid,lang='en'):
        if qid in self.competition_items:
            item = self.competition_items[qid]
            return item.labels[lang]

        try:
            item = pywikibot.ItemPage(self.datasite, 'Q'+str(qid))
            self.competition_items[qid] = item
            item.get()
        
            return item.labels[lang]
        except Exception, e:        
            pywikibot.output('Item for competition qid %s cant be loaded' % qid)
            print e
            return

    def medal_qid(self, competition_qid, medal_type):
        base_medal_qids = [406039,847956,873364]
        Olympic_medal_qids = [15243387,15889641,15889641]
        Paralympic_medal_qids = [15243424,15243447,15243454]

        competition_medal_qids = base_medal_qids

        competition_name = self.getCompetitionLabel(competition_qid)

        if 'Olympic' in competition_name:
            competition_medal_qids = Olympic_medal_qids
        elif 'Paralympic' in competition_name:
            competition_medal_qids = Paralympic_medal_qids
            
        award_qid = None
        if medal_type == 'gold':
            award_qid = competition_medal_qids[0]
        elif medal_type == 'silver':
            award_qid = competition_medal_qids[1]
        elif medal_type == 'bronze':
            award_qid = competition_medal_qids[2]
        else:
            print 'Unknown medal type: %s' % medal_type

        return award_qid

    def add_medal(self, item, award_qid, competition_qid, sport_qid):
        if not self.source_snak:
            (property_id, source_qid) = self.database.source_claim()
            self.source_snak = {"snaks":{"P"+str(property_id):[
                 {"snaktype":"value","property":"P"+str(property_id),
                  "datavalue":{"value":{"entity-type": "item","numeric-id":source_qid},"type":"wikibase-entityid"}
                 }
              ]}}

        print "Adding medal to %s" % (item.title())

        sport_qual = {"snaktype":"value","property":"P641",
                      "datavalue":{"value":{"entity-type": "item","numeric-id":sport_qid},"type":"wikibase-entityid"}}

        sigevent_qual = {"snaktype":"value","property":"P793",
                      "datavalue":{"value":{"entity-type": "item","numeric-id":competition_qid},"type":"wikibase-entityid"}}

        data = {'claims':[
                    {"mainsnak":{"snaktype":"value","property":"P166",
                                "datavalue":{"value":{"entity-type": "item","numeric-id":award_qid},"type":"wikibase-entityid"}},
                     "type":"statement","rank":"normal",
                     "references": [self.source_snak],
                     "qualifiers":[sport_qual,sigevent_qual]
                    }]}

        print data
        answer = pywikibot.inputChoice("Create?",['Yes', 'No', 'Always'], ['y', 'N', 'a'], 'N')
        if answer != 'y' and answer != 'a':
            return

        try:
            item.editEntity(data)
        except Exception, e:
            pywikibot.output(u"Failed to save data for %s." % page.title() )
            print e
            print "Sleeping ..."
            time.sleep(3)
            try:
                item.editEntity(data)
            except:
                pywikibot.output(u"Failed to save data for %s again. Sleeping & skipping ..." % page.title() )
                time.sleep(3)
    
    def sync_medals(self, item_qid, medals):
        item = pywikibot.ItemPage(self.datasite, 'Q'+str(item_qid))
        if not item.exists():
            # TODO: create
            return

        item.get()
        for medal in medals:
            competition_qid = medal['at']
            sport_qid = medal['sport']
            award_qid = self.medal_qid(competition_qid, medal['type'])
        
            if not award_qid:
                print "Not able to add a medal to %s" % item.title()
                print medal
                continue

            print 'processing...'
            print medal

            if "P166" in item.claims:
                found = False
                for claim in item.claims['P166']:
                    claim_award_qid = int(claim.target.title()[1:])
                    if claim_award_qid == award_qid:
                        if not claim.qualifiers or 'P793' not in claim.qualifiers or 'P641' not in claim.qualifiers:
                            print 'Found matching award without any qualifiers'
                            found = True
                            continue

                        if len(claim.qualifiers['P641']) > 1 or len(claim.qualifiers['P793']) > 1:
                            print 'strange claim %s' % claim.snak

                        claim_sport_qid = claim.qualifiers['P641'][0].target.title()[1:]
                        claim_at_qid = claim.qualifiers['P793'][0].target.title()[1:]
                        if claim_sport_qid == sport_qid and claim_at_qid == competition_qid:
                            print 'Found matching award, incl. the quals'
                            found = True
                            break

                        print 'Found matching award; however the quals are different'
                if found:
                    # TODO: there could be multiple identical medals for the same sport/competition
                    # still need to add medal to wikidata if source has more than wikidata
                    continue

            self.add_medal(item, award_qid, competition_qid, sport_qid)

    def update_items(self):
        for medalist in self.database.medalists:
            if medalist['qid']:
                self.sync_medals(medalist['qid'], medalist['medals'])

    def run(self, generator = None):
        """
        Starts the robot.
        """
        while True:
            if generator:
                for page in generator:
                    self.database.import_page(page)
            else:
                print 'building data for '+self.database.name()
                self.database.update()

            #self.create_items()
            self.update_items()
            # cant loop until update is working correctly
            return
            time.sleep(10)

def main():
    """
    Parse the command line and run the bot.

    Options:
        * -event:<title> - title of the event page (required)
        * any argument accepted by the pywikibot page generator factory

    Raises Exception on an unknown option or when no event is given.
    """
    args = pywikibot.handleArgs()
    event = None
    site = pywikibot.getSite()
    datasite = site.data_repository()

    gen = pagegenerators.GeneratorFactory()
    for arg in args:
        # Handle page generator args
        if gen.handleArg(arg):
            continue
        if arg.startswith('-event:'):
            event = arg[len('-event:'):]
        else:
            # Name the offending option so the operator can correct it.
            raise Exception('Unknown command line option: %r' % arg)

    if not event:
        raise Exception('Need an event page')

    event = pywikibot.Page(site, event)
    event.get()

    db = WikipediaDatabase()
    db.event(event)

    generator = gen.getCombinedGenerator()

    bot = MedalRobot(db, datasite)

    bot.run(generator)

# Run the bot only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()