User:JVbot/wikipedia-sync.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script watches a group of Wikipedia pages for Wikidata and Wikipedia issues

"""
#
# (C) John Vandenberg, 2014
#
# Distributed under the terms of the MIT license.
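#
# Usage (the QID is illustrative):
#   python wikipedia-sync.py -qid:12345          # poll continuously
#   python wikipedia-sync.py -qid:12345 -once    # run a single pass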

import time

import pywikibot
from pywikibot import pagegenerators

class WikiDataBase:
    """
    Monitors a closely related set of Wikipedia pages using Wikidata.
    Continually polls for changes to support frequently updating events (e.g. sports championships).
    """
    item = None # base item to monitor
    items = None # all items processed

    unlinked_pages = None # pages with no Wikidata item, keyed by interwiki title

    category_matrix = None # per category QID and language, the member item QIDs

    category_only = False # when a category exists, skip the 'what links here' scan
    first_pass_completed = False

    def __init__(self, page, category_only=False):
        """
        Arguments:
            * page - Wikidata item (ItemPage) of the event to monitor
            * category_only - when True, only traverse the category tree
        """
        self.item = page
        self.items = {}
        self.unlinked_pages = {}
        self.category_matrix = {}
        self.category_only = category_only

    def item_wikipedia_languages(self, item):
        """Return the sorted language codes of the Wikipedias linked from item."""
        sites = sorted(item.sitelinks.keys())
        wikipedias = []
        for site in sites:
            if site.endswith('wiki') and site != 'commonswiki':
                if site[0:-4] == 'no':
                    # the 'no' site code maps to the 'nb' language code
                    wikipedias.append('nb')
                else:
                    wikipedias.append(site[0:-4])
        return wikipedias
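    # For example, sitelinks {'enwiki': ..., 'nowiki': ..., 'commonswiki': ...}
    # yield ['en', 'nb']: commonswiki is skipped and 'no' becomes 'nb'.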

    def process_category_page(self, category_page, parents=True, recurse=True, articles=True):
        #pywikibot.output('processing (category) %s' % (category_page.title(forceInterwiki=True)) )
        item = self.process_page(category_page)
        if item:
            key = item.title()
            lang = category_page.site.lang
            if key in self.category_matrix:
                if lang in self.category_matrix[key]:
                    # this category has already been processed for this language
                    return
                self.category_matrix[key][lang] = []
            else:
                self.category_matrix[key] = {lang: []}

            # don't process the contents of Wikimedia administration categories (Q15647814)
            if 'P31' in item.claims and item.claims['P31'][0].getTarget().title() == 'Q15647814':
                return

        if recurse:
            for subcat in category_page.subcategories():
                subcat_item = self.process_category_page(subcat)
                if item and subcat_item:
                    self.category_matrix[key][lang].append(subcat_item.title())

        if parents:
            for parent_cat in category_page.categories():
                self.process_page(parent_cat, cats=parents)

        if articles:
            # get all items for all pages in category
            for page in category_page.articles():
                page_item = self.process_page(page) #,cats=parents)
                if item and page_item:
                    self.category_matrix[key][lang].append(page_item.title())

        return item

    def process_page(self, page, cats=False):
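        # Hard-coded exclusions for specific 2014 Winter Paralympics pages.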
        if page.site.lang == 'fr' and page.title().endswith("mars aux Jeux paralympiques d'hiver de 2014"):
            return
        elif (page.site.lang == 'no' or page.site.lang == 'nb') and ' under Paralympiske ' in page.title() and not ('2014' in page.title() or '2010' in page.title()):
            return
        elif page.title().startswith('Template:2014 Winter Paralympics wheelchair curling'):
            return

        #pywikibot.output('processing (page) %s' % (page.title(forceInterwiki=True)) )

        page_item = page.data_item()
        if not page_item.exists():
            key = page.title(asLink=True, forceInterwiki=True).replace('[[', '').replace(']]', '')
            if key not in self.unlinked_pages:
                pywikibot.output('%s does not exist in Wikidata' % key)
                self.unlinked_pages[key] = page
                if self.first_pass_completed:
                    self.find_unlinked_matches(page)
            return

        page_item = self.process_item(page_item, page=page)

        if cats:
            for cat in page.categories():
                if not cat.exists():
                    key = cat.title(asLink=True, forceInterwiki=True).replace('[[', '').replace(']]', '')
                    if key not in self.unlinked_pages:
                        pywikibot.output("%s on %s doesn't exist at all" % (key, page.title()))
                        self.unlinked_pages[key] = cat
                else:
                    self.process_page(cat, cats=False)

        return page_item

    def process_item(self, item, page=None):
        """
        Process a Wikidata item, reporting missing labels, claims and sitelinks.

        page must be one of the item's sitelinks; it can be any of them.  It is
        used to determine the type of page in this item, which is presumed to
        be the same for all sitelinks.
        """
        quiet = False

        if item.title() in self.items:
            return self.items[item.title()]

        # check existence before get(), which raises an error for missing items
        if not item.exists():
            return

        item.get()

        if 'en' in item.labels:
            item_label = item.labels['en']
        elif len(item.labels):
            label_lang = sorted(item.labels.keys())[0]
            item_label = item.labels[label_lang] + u' (' + label_lang + u')'
        else:
            item_label = u'NO LABEL IN ANY LANGUAGE!!'

        item_label = item_label + u' (' + item.title() + u')'

        if 'en' not in item.labels:
            pywikibot.output('%s does not have an English label' % item_label)

        if 'P31' not in item.claims:
            if page and page.isCategory():
                pywikibot.output('%s should have an instance of: Q15647814 (admin) or Q4167836 (content)' % item_label)
            else:
                pywikibot.output("%s should have an 'instance of' (P31) claim" % item_label)
        elif item.claims['P31'][0].getTarget().title() == 'Q15647814':
            # administration category items need not exist on every wiki; stay quiet
            quiet = True

        if not quiet:
            sitelink_keys = list(item.sitelinks.keys())
            if len(sitelink_keys) == 1 and sitelink_keys[0] != 'enwiki':
                pywikibot.output('%s only exists on %s' % (item_label, sitelink_keys[0]))
            elif 'enwiki' not in sitelink_keys:
                pywikibot.output('%s exists on %d wikis but not enwiki' % (item_label, len(sitelink_keys)))

        self.items[item.title()] = item
        return item

    def process_item_pages(self, item, wikipedias=None):
        """Process each Wikipedia page the item links to."""
        item.get()
        if not wikipedias:
            wikipedias = self.item_wikipedia_languages(item)

        for lang in wikipedias:
            if lang == 'nb':
                pagename = item.sitelinks['nowiki']
            else:
                pagename = item.sitelinks[lang + 'wiki']

            page = pywikibot.Page(pywikibot.Site(lang, 'wikipedia'), pagename)
            self.process_page(page, cats=True)

    def process_category_item(self, category_item, wikipedias=None, recurse=True, parents=True, articles=True):
        """Process the category's pages on each linked Wikipedia."""
        category_item.get()
        if not wikipedias:
            wikipedias = self.item_wikipedia_languages(category_item)

        for lang in wikipedias:
            if lang == 'nb':
                pagename = category_item.sitelinks['nowiki']
            else:
                pagename = category_item.sitelinks[lang + 'wiki']

            if not self.first_pass_completed:
                pywikibot.output('Finding pages on %s.wikipedia' % lang)
            category_page = pywikibot.Category(pywikibot.Site(lang.replace('_', '-'), 'wikipedia'), pagename)
            self.process_category_page(category_page, parents=parents, recurse=recurse, articles=articles)

    def processWLH(self):
        """Process mainspace pages that link to the base item ('What Links Here')."""
        wikipedias = self.item_wikipedia_languages(self.item)
        for lang in wikipedias:
            if lang == 'nb':
                pagename = self.item.sitelinks['nowiki']
            else:
                pagename = self.item.sitelinks[lang + 'wiki']
            if not self.first_pass_completed:
                pywikibot.output('Finding linked pages on %s.wikipedia' % lang)
            main_page = pywikibot.Page(pywikibot.Site(lang, 'wikipedia'), pagename)

            gen = pagegenerators.ReferringPageGenerator(main_page)
            gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces=[0])

            for page in gen:
                if self.process_page(page, cats=False):
                    pywikibot.output('%s is related but not in the category tree' % page.title())

    def find_unlinked_matches(self, page):
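        """Search the data repository for items that might match an unlinked page."""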
        page_title = page.title()
        if ' (' in page_title:
            page_title = page_title.split(' (')[0]

        count = 0
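        # request 11 results so that 'more than 10 matches' can be detected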
        gen = pagegenerators.SearchPageGenerator(page_title, namespaces=[0], total=11, site=self.item.site)
        for item in gen:
            if count == 10:
                pywikibot.output(u'.. and more results exist')
                break

            count = count + 1
            item = pywikibot.ItemPage(self.item.site, item.title())
            item.get()
            if 'en' in item.labels:
                item_label = item.labels['en']
            elif len(item.labels) == 0:
                item_label = '<none>'
            else:
                item_label = item.labels[item.labels.keys()[0]]

            pywikibot.output(u'%s might be %s (%s)' % (page.title(asLink=True,forceInterwiki=True).replace('[[','').replace(']]',''), item_label, item.title() ))

    def find_all_unlinked_matches(self):
        pywikibot.output('---- Now looking for matches for all unlinked pages ----')
        for key, page in self.unlinked_pages.items():
            self.find_unlinked_matches(page)
        pywikibot.output('---- Finished looking for matches for all unlinked pages ----')

    def find_missing_cats(self):
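        """
        Compare each category's membership across languages, reporting pages
        missing from existing local categories and languages where the
        category does not exist yet but could be created.
        """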
        # This isn't the most efficient of algorithms.
        pywikibot.output('---- Now looking for categories to populate ----')
        for category_qid in self.category_matrix:
            category_data = self.category_matrix[category_qid]
            category_item = self.items[category_qid]
            category_langs = self.item_wikipedia_languages(category_item)
            all_contents = []
            for lang in category_langs:
                if lang == 'no':
                    lang = 'nb'
                if lang not in category_data:
                    pywikibot.output(u"%s lang %s wasn't processed for articles, or category is empty" % (category_qid, lang))
                    continue
                all_contents += category_data[lang]
            all_contents = set(all_contents)

            all_lang_data = {}
            for page_qid in all_contents:
                page_item = self.items[page_qid]
                for lang in self.item_wikipedia_languages(page_item):
                    if lang == 'no':
                        lang = 'nb'
                    if lang not in all_lang_data:
                        all_lang_data[lang] = [page_qid]
                    else:
                        all_lang_data[lang].append(page_qid)

            # drop languages with nothing actionable; copy the keys because the
            # dict is modified while iterating
            for lang in list(all_lang_data.keys()):
                if lang in category_data:
                    if len(category_data[lang]) == len(all_lang_data[lang]):
                        del all_lang_data[lang]
                else:
                    if len(all_lang_data[lang]) == 1:
                        #pywikibot.output(u'%s in new lang %s has only one possible item: %s; skipping' % (category_qid, lang, all_lang_data[lang][0]) )
                        del all_lang_data[lang]

            if not len(all_lang_data):
                continue

            label_lang = None
            if 'en' in category_item.labels:
                label_lang = 'en'
            elif len(category_item.labels):
                label_lang = sorted(category_item.labels.keys())[0]

            if label_lang:
                label = category_item.labels[label_lang]
            else:
                label = '<no label>'
            pywikibot.output(u'%s (%s) improvements:' % (category_qid, label))

            new_wikipedias = set(all_lang_data.keys()) - set(category_data.keys())

            modified_wikipedias = set(all_lang_data.keys()) - new_wikipedias

            pywikibot.output(u'   existing category %s additions on wikis %s:' % (category_qid, u','.join(modified_wikipedias) ) )

            for lang in modified_wikipedias:
                if lang == 'nb':
                    slang = 'nowiki'
                else:
                    slang = lang + 'wiki'

                msg_prefix = u'     %s:%s ' % (lang, category_item.sitelinks[slang])

                category_lang_missing = set(all_lang_data[lang]) - set(category_data[lang])
                pywikibot.output(msg_prefix + u'is missing the following pages: %s' % (','.join(category_lang_missing)))

            new_wikipedias_with_labels = new_wikipedias & (set(category_item.labels.keys()) - set(category_langs))

            if not new_wikipedias_with_labels:
                pywikibot.output(u'   potential new categories for %s on wikis %s (none have labels):' % (category_qid, u','.join(new_wikipedias) ) )
            else:
                pywikibot.output(u'   potential new categories for %s on wikis %s:' % (category_qid, u','.join(new_wikipedias) ) )
                new_wikipedias = list(new_wikipedias_with_labels) + list(set(new_wikipedias) - set(new_wikipedias_with_labels))

            for lang in new_wikipedias:
                msg_prefix = u'     '
                if lang in category_item.labels:
                    msg_prefix += u'%s:%s ' % (lang,category_item.labels[lang])
                else:
                    msg_prefix += u'%s:?? ' % (lang) 

                if lang == 'no':
                    lang = 'nb'

                pywikibot.output(msg_prefix + u'should be created with items: %s' % (','.join(all_lang_data[lang])))

    def refresh_unlinked(self):
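        """Re-check pages that previously had no Wikidata item."""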
        found = []
        for key, page in list(self.unlinked_pages.items()):
            page_item = page.data_item()
            if page_item.exists():
                pywikibot.output(u'%s now exists in Wikidata as %s; processing..' % (key, page_item.title()) )
                found.append(key)
                self.process_page(page)

        for key in found:
            del self.unlinked_pages[key]

    def refresh(self):
        """
        Populates the database.
        """
        self.item.get()

        self.process_item(self.item)

        # Q4167836: Wikimedia category
        is_category_item = 'P31' in self.item.claims and self.item.claims['P31'][0].getTarget().title() == 'Q4167836'

        category_item = None
        if 'P910' in self.item.claims:
            # P910: topic's main category
            category_item = self.item.claims['P910'][0].getTarget()
        elif is_category_item:
            category_item = self.item

        if self.first_pass_completed:
            self.refresh_unlinked()

        if not is_category_item:
            pywikibot.output('---- Processing the pages linked to this Q ----')
            self.process_item_pages(self.item)

        if category_item:
            pywikibot.output('---- Now looking at category (%s) on all wikipedias ----' % category_item.title())
            self.process_category_item(category_item)
        
        if not category_item or (not is_category_item and not self.category_only):
            pywikibot.output('---- Now looking at what links here ----')
            self.processWLH()

    def run(self, sleep=None):
        """
        Refresh forever; sleep is the number of minutes to wait between
        passes (refresh again immediately if not set).
        """
        while True:
            self.refresh()
            if not self.first_pass_completed:
                self.find_all_unlinked_matches()
                self.find_missing_cats()
                self.first_pass_completed = True

            if sleep:
                pywikibot.output(u"Sleeping for %d minutes ..." % sleep)
                time.sleep(sleep * 60)

def main():
    args = pywikibot.handleArgs()
    qid = None
    site = pywikibot.getSite()
    datasite = site.data_repository()
    poll = True
    # note: articles/parents are parsed below but not yet wired into WikiDataBase
    articles = True
    parents = False

    gen = pagegenerators.GeneratorFactory()
    for arg in args:
        # Handle page generator args
        if gen.handleArg(arg):
            continue
        elif arg.startswith('-qid:'):
            # accept both -qid:123 and -qid:Q123
            qid = int(arg[len('-qid:'):].lstrip('Qq'))
        elif arg == '-once':
            poll = False
        elif arg == '-structure':
            articles = False
        elif arg == '-parents':
            parents = True
        else:
            raise Exception('Unknown command line option: %s' % arg)

    if not qid:
        raise Exception('Need a -qid parameter')

    item = pywikibot.ItemPage(datasite, 'Q'+str(qid))
    item.get()

    generator = gen.getCombinedGenerator()

    db = WikiDataBase(item, category_only=True)

    if generator:
        for page in generator:
            db.process_page(page, cats=True)
    else:
        try:
            if not poll:
                db.refresh()
                db.find_all_unlinked_matches()
                db.find_missing_cats()
            else:
                db.run(1)
        except KeyboardInterrupt:
            pass

if __name__ == "__main__":
    main()