User:AkkakkBot/code/05-links-to-labels

Python code:

        site = pywikibot.Site("wikidata", "wikidata")
        repo = site.data_repository()
        db = MySQLdb.connect(host="wikidatawiki.labsdb", db="wikidatawiki_p", read_default_file="~/replica.my.cnf")

        editmax = 3000
        log = ""
        exit_reason = "end of data"
        summary_max_len = "100"

        language_map = {}
        cur = db.cursor()
        cur.execute('select * from s51880_akkakkbot.languages;')
        lines = cur.fetchall()
        for line in lines:
            language_map[line[0]] = line[1]

        #get items
        print("get items")
        sys.stdout.flush()
        items = []
        cur = db.cursor()
        #cur.execute('select ips_item_id from wb_items_per_site join s51880_akkakkbot.languages on lang_db = ips_site_id and ips_site_page not like "%, %" and ips_site_page not like "% (%)" and not (lang_db = "alswiki" or lang_db = "crhwiki" or lang_db = "nowiki" or lang_db = "bat_smgwiki" or lang_db = "be_x_oldwiki" or lang_db = "fiu_vrowiki" or lang_db = "roa_rupwiki" or lang_db = "zh_classicalwiki" or lang_db = "zh_min_nanwiki" or lang_db = "zh_yuewiki") and not exists (select term_language from wb_terms where ips_item_id = term_entity_id and lang_wd = term_language and term_type = "label") limit 50000;')
        cur.execute('select ips_item_id from wb_items_per_site join s51880_akkakkbot.languages on lang_db = ips_site_id and ips_site_page not like "%, %" and ips_site_page not like "% (%)"  and not exists (select term_language from wb_terms where ips_item_id = term_entity_id and lang_wd = term_language and term_type = "label") limit 50000;')
        lines = cur.fetchall()
        for line in lines:
            for cell in line:
                if(not cell in items):
                    items.append("Q{}".format(cell))

        #iterate items
        print("iterate items")
        editcnt = 0
        try:
            for i, q in enumerate(items):
                sys.stdout.flush()
                print("checking {}:".format(q)),
                if(os.path.isfile("bot-05-links-to-labels.stop")):
                    print("stop file")
                    exit_reason = "stop file"
                    break
                item = pywikibot.ItemPage(repo, q)
                try:
                    content = item.get()
                    links = content['sitelinks']
                    labels = content['labels']
                    descriptions = content['descriptions']
                    changed = False
                    summary_text = ""
                    for langdb in links:
                        langwd = langdbtowd(language_map, langdb)
                        if langwd != "" and not langwd in labels:
                            link = links[langdb]
                            description = ""
                            if langwd in descriptions:
                                description = descriptions[langwd]
                            if (re.match("(.+ \(.+\)|.+, .+)", link)):
                                print("- skip "+langwd+":contains brackets"),
                            #elif (description != "" and exists_item(db=db, language=langwd, label=link, description=description)):
                            #    print("- skip"+langwd+":there is an item with the same label+description"),
                            else:
                                print(u"- set "+langwd+u" label"),
                                labels[langwd] = link
                                if not (changed):
                                    log_line = "* [["+q+"]]:"
                                    changed = True
                                log_line += u" "+langwd+u": "+link
                                summary_text += u" - set "+langwd+u" label to "+link
                    if(changed):
                        editcnt += 1
                        print("- edit {}...".format(editcnt))
                        summary_text += " (task 5)"
                        if(len(summary_text) > summary_max_len):
                            summary_text = u"set labels to links (task 5)"
                        try:
                            item.editLabels(summary=summary_text, labels=labels)
                            log_line += "\n"
                            log += log_line
                        except TypeError:
                            print("  Type Error")
                            editcnt -= 1
                        if(editcnt >= editmax):
                            print("maximum number of edits reached")
                            exit_reason = "maximum number of edits reached"
                            break
                except pywikibot.exceptions.NoPage:
                    print("  item does not exist")
                except pywikibot.data.api.APIError:
                    print("  api error. trying to continue.")
                except UnicodeEncodeError:
                    print("  UnicodeEncodeError.why?")
        except Exception as exc:
            print("exception")
            traceback.print_exc()
            exit_reason = "exception"
                     
        if log != "":
            log += "exit reason:"+exit_reason
            pageobj = pywikibot.Page(site, u"User:AkkakkBot/log")
            pageobj.put(log, u"log for task 5: set unset labels to links", minorEdit = False)
        print("end of script")
        return exit_reason

'''
+------------------+----------+
| lang_db          | lang_wd  |
+------------------+----------+
| alswiki          | gsw      | *
| bat_smgwiki      | bat-smg  |
| be_x_oldwiki     | be-x-old |
| cbk_zamwiki      | cbk-zam  |
| crhwiki          | crh-latn | *
| fiu_vrowiki      | fiu-vro  |
| map_bmswiki      | map-bms  |
| nds_nlwiki       | nds-nl   |
| nowiki           | nb       | *
| roa_rupwiki      | roa-rup  |
| roa_tarawiki     | roa-tara |
| simplewiki       | en       |
| zh_classicalwiki | lzh      | *
| zh_min_nanwiki   | nan      | *
| zh_yuewiki       | yue      | *
+------------------+----------+
'''
def langdbtowd(language_map, langdb, langignore=()):
        """Translate a sitelink site id into the language code used for labels.

        language_map -- dict from site id (e.g. "nowiki") to label language
                        code (e.g. "nb").
        langdb       -- the site id to translate.
        langignore   -- optional collection of site ids to skip even when they
                        are present in language_map. It defaults to empty,
                        which matches the former hard-coded empty list.

        Returns the language code, or "" when the site id is ignored or is
        not a key of language_map.
        """
        if langdb in langignore:
            return ""
        return language_map.get(langdb, "")