User:JVbot/periodicalbot.py

#!/usr/bin/python
# -*- coding: utf-8  -*-

"""
This script creates Wikidata claims from enwp:Template:Infobox journal, Infobox magazine, and equivalent periodical infoboxes on other language Wikipedias.

Pages to work on can be specified using any of:

&params;

"""

#
# (C) John Vandenberg, 2014
#
# Distributed under the terms of the MIT license.
#
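
# A minimal usage sketch (illustrative arguments; -cat/-page are standard
# pagegenerators options, while -type, -f and -q are handled in main() below):
#   python periodicalbot.py -cat:"Academic journals" -type:journal
#   python periodicalbot.py -page:"Nature (journal)" -q:-1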

import json
import time  # sleep
import stdnum.issn as stdnum_issn

import pywikibot
from pywikibot import pagegenerators

docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
}

def getInfobox(templates, infobox_type=None):
    for (inPageTemplate, params) in templates:
        template_title = inPageTemplate.title().lower()
        if ((not infobox_type and ':infobox' in template_title) or
            (infobox_type and (':infobox ' + infobox_type in template_title or
                               ':info/' + infobox_type in template_title or
                               ':ficha de ' + infobox_type in template_title or
                               ':' + infobox_type in template_title))):
            if ':infobox ' in template_title:
                page_infobox_type = template_title.split(':')[1][len('infobox '):]
            elif ':ficha de ' in template_title:  # es
                page_infobox_type = template_title.split(':')[1][len('ficha de '):]
            elif ':info/' in template_title:  # pt
                page_infobox_type = template_title.split(':')[1][len('info/'):]
            else:  # other; the template name itself is the type
                page_infobox_type = template_title.split(':')[1]
            params.append('infobox_type=' + page_infobox_type)
            return params

    print 'No matching infobox found. Templates:'
    seen = []
    for (inPageTemplate, params) in templates:
        if inPageTemplate.title() not in seen:
            print inPageTemplate.title()
            seen.append(inPageTemplate.title())

def getInfoboxField(infobox, field):
    for param in infobox:
        if param.lower().startswith(field.lower()+'='):
            return param[param.find('=')+1:]

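# Usage sketch for the two helpers above (hypothetical infobox values):
# page.templatesWithParams() yields params like ['title=Nature', 'ISSN=0028-0836'],
# so getInfoboxField(infobox, 'issn') would return '0028-0836'
# (field matching is case-insensitive; values are returned verbatim).
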
def loadSources(family=None, language_codes=None, repo=None):
    """
    Fetch the source-item map from the on-wiki list
    and return it.
    """
    if not repo:
        repo = pywikibot.Site('wikidata', 'wikidata')
    print 'Fetching wiki site items'
    page = pywikibot.Page(repo, u'Wikidata:List of wikis/python')
    # TODO: cache page
    source_values = json.loads(page.get())
    if family:
        source_values = source_values[family]
    #for source_lang in source_values:
    #    if not language_codes or source_lang in language_codes:
    #        source_values[source_lang] = pywikibot.ItemPage(repo, source_values[source_lang])

    # TODO: if any requested language is missing from the list, raise an
    # exception, e.g. Exception('Unsupported source language %s' % lang)

    return source_values
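
# The on-wiki JSON is assumed to map family -> language -> item id; a
# hypothetical excerpt: {"wikipedia": {"en": "Q328", "fr": "Q8447"}}.
# loadSources('wikipedia')['fr'] would then give 'Q8447', and main() strips
# the leading 'Q' to get the numeric id used in source references.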

def loadClaims(filename, property_id):
    claims = {}

    f = open(filename, 'rb')

    property_id = str(property_id)

    for line in f:
        q, p, v = line.strip().split("\t")
        if p.strip() == property_id:
            v = v.strip()[1:]  # drop the one-character prefix on the value column
            q = 'Q' + q.strip()

            if v not in claims:
                claims[v] = []
            claims[v].append(q)

    f.close()
    return claims

def loadISSNs(filename):
    # The file format is the output of parse_xml_bz2.php, which comes from
    # https://bitbucket.org/magnusmanske/wikidataquery
    # It can be obtained via: cat dump.xml | parse_xml_bz2.php | grep '[[:space:]]236[[:space:]]'

    return loadClaims(filename, 236)  # P236 = ISSN
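
# A sketch of the expected input (hypothetical line; '<TAB>' stands for a tab
# character, and the value column is assumed to carry a one-character quote
# prefix that loadClaims() strips):
#   180445<TAB>236<TAB>'0028-0836
# would yield {'0028-0836': ['Q180445']}, keyed by ISSN for the duplicate
# checks in main().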

def main():
    args = pywikibot.handleArgs()

    to_q = None
    testsitename = False
    verbose = False
    force = False
    infobox_type = 'journal'
    gen_args = []

    genFactory = pagegenerators.GeneratorFactory()

    for arg in args:
        if genFactory.handleArg(arg):
            gen_args.append(arg)
            continue
        elif arg.startswith('-test:'):
            testsitename = arg[len('-test:'):]
        elif arg.startswith('-f'):
            force = True
        elif arg.startswith('-type:'):
            infobox_type = arg[len('-type:'):]
            if infobox_type.lower().replace('_', ' ') == u'presse ecrite':
                infobox_type = u'presse écrite'
        elif arg.startswith('-q:'):
            to_q = arg[len('-q:'):]
            force = True
        else:
            raise Exception('Unknown argument: %s' % arg)

    site = pywikibot.getSite()
    
    if testsitename:
        (test_lang,test_family) = testsitename.split('.')
        datasite = pywikibot.getSite(test_lang,test_family)
        if datasite:
            print "Using %s" % datasite.sitename()
        else:
            raise Exception('Failed to get a test site to play with')
        datasite = datasite.data_repository()
    else:
        datasite = site.data_repository()

    gen = genFactory.getCombinedGenerator()
    if not gen:
        raise Exception('Target pages not specified')

    gen_args = ' '.join(gen_args)

    if infobox_type in ('journal', 'revue', u'revista científica', 'akademisk tidskrift'):
        instance_of_qid = '5633421'  # scientific journal
    elif infobox_type == 'magazine':
        instance_of_qid = '41298'  # magazine
    elif infobox_type == 'newspaper':
        instance_of_qid = '11032'  # newspaper
    elif infobox_type in ('publikation', 'tijdschrift', 'revista', u'presse écrite'):
        instance_of_qid = '1092563'  # periodical literature
    else:
        raise Exception('Unknown type %s' % infobox_type)
    
    lang = site.language()

    enwp_qid = source_wp_qid = '328' # English Wikipedia

    if lang != 'en':
        sources = loadSources('wikipedia',language_codes=[lang],repo=datasite)
        source_wp_qid = sources[lang][1:]

    issns = loadISSNs('dumps/issn_claims.tab')
    print "loaded %d issns" % len(issns)

    for page in gen:
        #pywikibot.output(u"Initialising %s ..." % page.title() )
        item = None
        wp_qid = source_wp_qid
        infobox_type_req = None  # accept the first infobox found
        if force:
            infobox_type_req = infobox_type  # require the expected type
                
        if to_q:
            if to_q != '-1':
                item = pywikibot.ItemPage(datasite,'Q'+to_q)
        else:
            item = pywikibot.ItemPage.fromPage(page)

        if item:
            if not item.exists():
                pywikibot.output(u"%s does not already exist in Wikidata." % page.title() )
                item = None
            else:
                try:
                    item.get()
                except Exception:
                    pywikibot.output(u"Failed loading %s item %s. Skipping." % (page.title(), item.title()) )
                    continue

        if item:
            if 'P357' in item.claims and 'P31' in item.claims and 'P236' in item.claims:
                if verbose:
                    pywikibot.output(u"%s already has the necessary claims..." % page.title() )
                continue

            if lang == 'zh' and not force:
                if item.sitelinks:
                    if 'enwiki' not in item.sitelinks:
                        other_langs = item.sitelinks.keys()
                        other_langs.remove(lang+'wiki')
                        if len(other_langs) == 0:
                            print u"%s doesn't exist on any other wikipedia..." % page.title()
                        elif len(other_langs) == 1:
                            print u"%s also exists on %s.wikipedia..." % (page.title(), other_langs[0][:2])
                        else:
                            print u"%s doesn't exist on enwiki; it does exist on: %s" % (page.title(), u','.join(other_langs))
                        continue
                    else:
                        print u"Loading enwiki %s for %s ..." % (item.sitelinks['enwiki'], page.title() )
                        page = pywikibot.Page( pywikibot.Site('en','wikipedia'), item.sitelinks['enwiki'])
                        wp_qid = enwp_qid
                        infobox_type_req = None
                        # TODO: also change all uses of 'lang' to en
                else:
                    raise Exception("Items without any sitelinks can't be processed yet")

        try:
            infobox = getInfobox(page.templatesWithParams(), infobox_type_req)
        except Exception:
            pywikibot.output(u"Failed to load %s. Sleeping ..." % page.title() )
            time.sleep(3)
            try:
                infobox = getInfobox(page.templatesWithParams(), infobox_type_req)
            except Exception:
                pywikibot.output(u"Failed to load %s again. Sleeping & skipping ..." % page.title() )
                time.sleep(3)
                continue

        if not infobox:
            print "Page %s doesn't have an infobox; skipping" % page.title()
            continue

        page_infobox_type = getInfoboxField(infobox, 'infobox_type')

        if page_infobox_type != infobox_type:
            print 'The first infobox on page %s is of type %s instead of %s; skipping' % (page.title(), page_infobox_type, infobox_type)
            continue

        title = None
        subtitle = None

        if lang == 'en' or lang == 'zh':
            title = getInfoboxField(infobox, 'title')
            if not title:
                title = getInfoboxField(infobox, 'name')
        elif lang == 'de' or lang == 'nl' or lang == 'sv':
            title = getInfoboxField(infobox, 'titel')
        elif lang == 'fr':
            title = getInfoboxField(infobox, 'titre')
            if not title:
                title = getInfoboxField(infobox, 'nom')  # infobox Presse écrite
        elif lang == 'pt' or lang == 'es':
            title = getInfoboxField(infobox, 'título')
        else:
            raise Exception('Unsupported title language %s' % lang)

        issn = getInfoboxField(infobox, 'ISSN')
        eissn = getInfoboxField(infobox, 'eISSN')
        if lang == 'fr' and not eissn and infobox_type == u'presse écrite':
            eissn = getInfoboxField(infobox, u'ISSN électronique')

        if eissn == issn:
            eissn = None

        if getInfoboxField(infobox, 'ISSN2'):
            print 'multiple ISSNs indicate the periodical has several parts; skipping'
            continue

        try:
            if issn:
                if len(issn.strip()) > 9:
                    print 'trimming %s' % issn
                    issn = issn.strip()[0:9]

                #issn = issn.replace(u' ','-')
                if not stdnum_issn.is_valid(issn):
                    print 'Page %s has invalid ISSN: %s' % (page.title(), issn)
                    issn = None
                elif stdnum_issn.format(issn) != issn:
                    print 'Page %s ISSN %s reformatted to %s' % (page.title(), issn, stdnum_issn.format(issn))
                    issn = stdnum_issn.format(issn)
                    
            if eissn:
                if not stdnum_issn.is_valid(eissn):
                    print 'Page %s has invalid eISSN: %s' % (page.title(), eissn)
                    eissn = None
                elif stdnum_issn.format(eissn) != eissn:
                    print 'Page %s eISSN %s reformatted to %s' % (page.title(), eissn, stdnum_issn.format(eissn))
                    eissn = stdnum_issn.format(eissn)
        except Exception:
            print 'Failure on page %s during ISSN checking for %s and %s' % (page.title(), issn, eissn)
            continue

        if issn:
            if issn in issns:
                issn_qs = issns[issn]
                print 'Page %s has an ISSN of %s, which is currently registered to %s' % (page.title(), issn, ','.join(issn_qs))
                if not item or item.title() not in issn_qs:
                    print '... which is not linked to the page'
                    continue
            else:
                print 'Page %s ISSN %s is not in Wikidata' % (page.title(), issn)
            
        if eissn:
            if eissn in issns:
                issn_qs = issns[eissn]
                print 'Page %s has an eISSN of %s, which is currently registered to %s' % (page.title(), eissn, ','.join(issn_qs))
                if not item or item.title() not in issn_qs:
                    print '... which is not linked to the page'
                    continue
            else:
                print 'Page %s eISSN %s is not in Wikidata' % (page.title(), eissn)

        if item and 'P357' in item.claims and 'P31' in item.claims and ('P236' in item.claims or (not issn and not eissn)):
            print "Page %s doesn't have any metadata to be added" % page.title()
            continue

        if not title:
            if infobox_type in page.title().lower():
                print 'Page %s doesn\'t have a "title" param, but the page title confirms the type' % page.title()
            elif issn or eissn:
                print 'Page %s doesn\'t have a "title" param' % page.title()
                print '...defaulting to page title; double check this'
            else:
                print 'Page %s doesn\'t have a "title" param and can\'t be verified; skipping' % page.title()
                continue
                
            title = page.title()
            if title.endswith(' ('+infobox_type+')'):
                title = title.split(' ('+infobox_type+')')[0]

        if '<br' in title: # <br>, <br/>, etc
            print "trimming title %s" % title
            title = title[0:title.find('<br')]

        # {{nowrap}} is occasionally used for long titles on enwp
        if lang == 'en':
            title = (title.replace('{{no wrap|', '').replace('{{nowrap|', '')
                          .replace('}}', '').replace("''", '')
                          .replace('&#124;', '|'))
        # frwp wraps titles in {{lang}}; keep only the first language variant
        elif lang == 'fr':
            title = (title.split('}}')[0]
                          .replace('{{Lang|', '{{lang|')
                          .replace('{{lang|en|', '').replace('{{lang|de|', '')
                          .replace('{{lang|fr|', '').replace('{{lang|la|', '')
                          .replace('{{lang|es|', '').replace('{{lang|el|', '')
                          .replace('{{lang|it|', '').replace('{{lang|pt|', '')
                          .replace('{{lang|cr|', '')
                          .replace('texte=', '').replace("''", ''))

        if ': ' in title:
            subtitle = title[title.find(': ')+2:].strip()
            title = title.split(': ')[0].strip() # strip to fix 'abc : def'

        if ' - ' in title:
            (title, subtitle) = title.split(' - ', 1)  # split only on the first ' - '
            subtitle = subtitle.strip()

        # Sometimes the periodical infobox is on an article about the
        # organisation - detect this, or other title mismatches
        if not force and (title.replace('The ', '').replace('La ', '').lower()
                          .replace('.', '').replace(' and ', ' & ').replace(u'’', "'")
                          not in page.title().lower()
                          .replace('.', '').replace(' and ', ' & ').replace(u'’', "'")):
            if (item
                    and ('en' not in item.labels
                         or (title not in item.labels['en']
                             and title not in item.labels.get(lang, '')))
                    and ('en' not in item.aliases or title not in item.aliases['en'])):
                print "Infobox title %s is not in the page title %s or the item's label or aliases; possibly an organisation with a periodical, or a periodical series" % (title, page.title())
                continue

        # Special cases to avoid
        if lang != 'en' and title == 'Proceedings of the Royal Society':
            continue

        if lang == 'zh' and not force:
            # By default, this bot expects a human to review the above output 
            # and check for duplicates before forcing the bot to proceed
            continue

        if not item:
            if to_q == '-1':
                if not issn and not eissn:
                    pywikibot.output(u"Skipping creation of item from %s as it doesn't have any unique id (ISSN, etc)" % page.title())
                    continue

                pywikibot.output(u"Creating item from %s" % page.title())
            else:
                pywikibot.output(u"Skipping creation of item from %s; add -q:-1 to create" % page.title())
                continue
        else:
            pywikibot.output(u"Adding claims from page %s to %s" % (page.title(),item.title()) )

        # TODO: default to putting labels in the language of the source item
        # put 'Back To The Roots' in labels[en]
        #if item and ('en' not in item.labels or (title not in item.labels['en'] and title not in item.labels[lang]))

        if not item:
            # Note: item creation assumes the source page is on enwiki
            data = {'labels': {'en': {'language': 'en', 'value': title}},
                    'descriptions': {'en': {'language': 'en', 'value': infobox_type}},
                    'sitelinks': {'enwiki': {'site': 'enwiki', 'title': page.title()}},
                    'claims':[]
                   }
        else:
            if 'P357' in item.claims:
                print 'Item %s already has "title" claim' % item.title()
            if 'P31' in item.claims:
                print 'Item %s already has "instance of" claims' % item.title()
            if 'P236' in item.claims:
                print 'Item %s already has ISSN claims' % item.title()
            elif not issn and not eissn:
                print 'Page %s doesn\'t have any "ISSN" params' % page.title()

            data = {'claims':[]}

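        # Every claim added below carries a P143 ("imported from") reference
        # pointing at the item for the source Wikipedia.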
        source_snak = {"snaks":{"P143":[
                         {"snaktype":"value","property":"P143",
                          "datavalue":{"value":{"entity-type": "item","numeric-id":wp_qid},"type":"wikibase-entityid"}
                         }
                      ]}}

        if not item or 'P357' not in item.claims:  # title
            data['claims'].append({"mainsnak":{"snaktype":"value","property":"P357","datavalue":{"value":title,"type":"string"}}, "type":"statement", "rank":"normal", "references": [source_snak]})
            
        if subtitle and (not item or 'P392' not in item.claims):  # subtitle
            data['claims'].append({"mainsnak":{"snaktype":"value","property":"P392","datavalue":{"value":subtitle,"type":"string"}}, "type":"statement", "rank":"normal", "references": [source_snak]})

        if not item or 'P31' not in item.claims:  # instance of
            data['claims'].append({"mainsnak":{"snaktype":"value","property":"P31","datavalue":{"value":{"entity-type": "item","numeric-id":instance_of_qid},"type":"wikibase-entityid"}}, "type":"statement", "rank":"normal", "references": [source_snak]}) 

        if issn and (not item or 'P236' not in item.claims):  # issn
            data['claims'].append({"mainsnak":{"snaktype":"value","property":"P236","datavalue":{"value":issn,"type":"string"}}, "type":"statement", "rank":"normal", "references": [source_snak]})

        if eissn and (not item or 'P236' not in item.claims):  # eissn
            data['claims'].append({"mainsnak":{"snaktype":"value","property":"P236","datavalue":{"value":eissn,"type":"string"}}, "type":"statement", "rank":"normal", "references": [source_snak]})

        if not item:
            item = pywikibot.ItemPage(datasite)

        #print data

        try:
            item.editEntity(data)
        except Exception:
            pywikibot.output(u"Failed to save data for %s. Sleeping ..." % page.title() )
            time.sleep(3)
            try:
                item.editEntity(data)
            except Exception:
                pywikibot.output(u"Failed to save data for %s again. Sleeping & skipping ..." % page.title() )
                time.sleep(3)
                continue

if __name__ == "__main__":
    try:
        main()
    finally:
        pywikibot.stopme()