Kolduxrep
Joined 18 July 2021
Queries edit
Sources edit
Misc edit
Compare Swiss municipality entries with the official government lists |
---|
#!/usr/bin/env python3
# Check the Wikidata entries of all municipalities of Switzerland.
#
# Copyright (C) 2021 Kolduxrep
# https://www.wikidata.org/wiki/User:Kolduxrep
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# Load official government data
#
# Reads the worksheet "Daten" of Gemeindestand.xlsx and produces df_gov: one
# row per municipality, indexed by municipality code and sorted by it, with
# the columns "canton" and "name".
import pandas as pd
from openpyxl import load_workbook

wb = load_workbook(filename="Gemeindestand.xlsx")
ws = wb["Daten"]

# The first worksheet row carries the column titles; everything below is data.
df = pd.DataFrame(ws.values)
df.columns = df.iloc[0]
df = df.iloc[1:]

# Both the code and the name have to identify a municipality unambiguously.
assert df["BFS Gde-nummer"].is_unique, "Nonunique municipality code in official government data"
assert df["Gemeindename"].is_unique, "Nonunique municipality name in official government data"

# Switch to the short English column names used by the rest of the script,
# keep only the columns of interest, and index the table by code.
df = (
    df.rename(
        columns={
            "BFS Gde-nummer": "code",
            "Kanton": "canton",
            "Gemeindename": "name",
        }
    )
    .loc[:, ["code", "canton", "name"]]
    .set_index("code")
    .sort_index()
)
df_gov = df
# Load Wikidata entries
#
# Reads query.csv (presumably the CSV export of the Wikidata query -- confirm)
# and produces df_wikidata: one row per Wikidata item, indexed by municipality
# code and sorted by it.
df_wikidata = pd.read_csv("query.csv").rename(
    columns={
        "municipality": "identifier",
        "swissMunicipalityCode": "code",
        "cantonCode": "canton",
        "municipalityLabel": "name",
    }
)

# An item may appear on several rows; only its first row is kept.
df_wikidata = df_wikidata.drop_duplicates("identifier")

# After deduplication every code must belong to exactly one item.
assert df_wikidata["code"].is_unique, "Nonunique municipality code in Wikidata entries"

df_wikidata = df_wikidata.set_index("code").sort_index()
# Compute the differences
#
# Collects every municipality whose code occurs in only one of the two tables.
# If there are any, they are written to diff.csv and the script fails with an
# AssertionError.

# Codes present in exactly one of df_gov and df_wikidata.
diff_index = df_gov.index.symmetric_difference(df_wikidata.index)
df_only_gov = df_gov.loc[diff_index.intersection(df_gov.index)]
df_only_wikidata = df_wikidata.loc[diff_index.intersection(df_wikidata.index)]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_diff = pd.concat([df_only_wikidata, df_only_gov])

# Columns that exist in only one of the tables are left blank for the other.
df_diff = df_diff.fillna(value="")

# Only write the report when there is something to report.
if not df_diff.empty:
    df_diff.to_csv("diff.csv")
assert df_diff.empty, "Differences between official government data and Wikidata exist"
# What remains to be done is to check the cantons, municipality names, and other fields.
|