User:ANU Outreachy/Outreachy 3
import pywikibot
import re
import requests
from bs4 import BeautifulSoup
# Site handle for English Wikipedia; the article pages fetched further down are created from it.
enwiki = pywikibot.Site('en', 'wikipedia')
# Data repository associated with the English Wikipedia site, as returned by data_repository().
enwiki_repo = enwiki.data_repository()
def findInception(page):
    """Print the inception year (P571) found in a "<year> software" category.

    Searches ``page.text`` for a category link such as
    ``[[Category:2008 software]]`` and prints the first hit as
    ``P571: 2008 software``. If the page carries no such category,
    ``P571: Not Found`` is printed instead of raising ``IndexError``.

    Args:
        page: object exposing the page wikitext through a ``text`` attribute.
    """
    # (?i) accepts "category:" in any letter case; the closing [\|\]] accepts
    # both "[[Category:2008 software]]" and the sort-key form "[[...software|key]]".
    match = re.search(r'(?im)\[\[\s*Category\s*:\s*(\d+) software\s*[\|\]]', page.text)
    if match is None:
        print('P571: Not Found')
        return
    print('P571: ' + match.group(1) + ' software')
def findInstanceOf(page):
    """Print the instance-of value (P31) found in a "<word> operating systems" category.

    Searches ``page.text`` for a category link such as
    ``[[Category:Unix operating systems]]`` and prints the first hit as
    ``P31: Unix operating systems``. If no such category exists,
    ``P31: Not Found`` is printed instead of raising ``IndexError``.

    Args:
        page: object exposing the page wikitext through a ``text`` attribute.
    """
    # Only a single \w+ token directly before " operating systems" is captured;
    # the closing [\|\]] accepts both a plain link and one with a sort key.
    match = re.search(r'(?im)\[\[\s*Category\s*:\s*(\w+) operating systems\s*[\|\]]', page.text)
    if match is None:
        print('P31: Not Found')
        return
    print('P31: ' + match.group(1) + ' operating systems')
def findprop(propertyname, propertyid, page):
    """Print one infobox value from ``page.text``, labelled with a property id.

    Looks for the first infobox-style line ``| <propertyname> = <value>`` and
    prints ``<propertyid>: <value>``, where the value is the last readable run
    of text after the final ``[[`` on that line. Nothing is printed when the
    parameter is absent or when no readable text can be extracted (the latter
    used to raise ``IndexError``).

    Args:
        propertyname: infobox parameter name, matched literally.
        propertyid: label printed in front of the value, e.g. ``'P178'``.
        page: object exposing the page wikitext through a ``text`` attribute.
    """
    # Escape the name so that characters such as "." or "(" are not treated
    # as regex syntax.
    pattern = r'\|\s*%s\s+\=\s*.+' % re.escape(propertyname)
    match = re.search(pattern, page.text)
    if match is None:
        return
    # Text between the first and the second "=" of the matched line.
    value = match.group(0).split('=')[1]
    # The original loop overwrote its result on every pass, so only the text
    # after the last "[[" ever decided the output; go to it directly.
    last_segment = value.split('[[')[-1].strip()
    items = re.findall(r'[\w\s]+[^\],][\w\s\.\|\)\-\,]+', last_segment)
    if items:
        print(propertyid + ": " + items[-1])
# Chromium: print the page, its inception category, then each infobox
# parameter paired with the property id it is reported under.
page = pywikibot.Page(enwiki, "Chromium_(web_browser)")
print(page)
findInception(page)
for _param, _pid in (
    ('screenshot', 'P18'),   # image
    ('logo', 'P154'),        # logo image
    ('author', 'P178'),      # developed by
    ('author', 'P112'),      # founded by
    ('name', 'P138'),        # named after
):
    findprop(_param, _pid, page)
print("\n")

# DragonFly BSD: same procedure, plus the operating-systems category lookup.
page2 = pywikibot.Page(enwiki, "DragonFly_BSD")
print(page2)
findInception(page2)
findInstanceOf(page2)
for _param, _pid in (
    ('website', 'P856'),     # website link
    ('logo', 'P18'),         # logo image link
    ('developer', 'P178'),   # developer
    ('developer', 'P112'),   # founded by
):
    findprop(_param, _pid, page2)
def findPropertyFromWikidata(item, propertyid):
    """Print every value an item holds for one property.

    For each claim stored under ``propertyid`` the property id is printed,
    followed by the English label of the claim target when the target can be
    loaded as a dictionary with ``get()``; otherwise the target object itself
    is printed. A blank separator follows the claims. When the item has no
    entry for the property, ``<propertyid>: Not Found`` is printed.

    Args:
        item: object whose ``get()`` returns a dictionary with a ``"claims"``
            mapping (the script passes ``pywikibot.ItemPage`` objects).
        propertyid: property identifier to look up, e.g. ``"P571"``.
    """
    item_dict = item.get()  # item content in dictionary form
    try:
        item_list = item_dict["claims"][propertyid]
    except KeyError:
        # Only a missing key means "not found"; other failures are no longer
        # disguised as a missing property.
        print(propertyid + ": Not Found\n")
        return
    if not item_list:
        return
    for claim in item_list:
        target = claim.getTarget()
        print(propertyid + ": ")
        try:
            # Targets that can be loaded as a dictionary are shown by English label.
            print(target.get()['labels']['en'])
        except Exception:
            # Best effort: targets without get() or without an English label
            # are printed as they are.
            print(target)
    print("\n")
"""BONUS TASK"""
# Repeat the lookups above, this time reading the claims straight from Wikidata.
print("\n\nPrinting the same properties directly from wikidata\n")
site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()

# First item: print its 'uk' label, then each requested property in turn.
item = pywikibot.ItemPage(repo, "Q48524")
item_dict = item.get()
print(item_dict['labels']['uk'])
for _pid in ("P571", "P18", "P154", "P178", "P112", "P138"):
    findPropertyFromWikidata(item, _pid)

# Second item: same procedure with its own list of properties.
item2 = pywikibot.ItemPage(repo, "Q751067")
item2_dict = item2.get()
print(item2_dict['labels']['uk'])
for _pid in ("P571", "P31", "P856", "P18", "P178", "P112"):
    findPropertyFromWikidata(item2, _pid)
What I Learnt
1. Learnt to write regular expressions (RE) in Python.
2. Learnt the use of search(), findall(), compile(), split(), sub(), subn(), escape() functions of 're'.
3. Learnt to make dictionary from page information and access that information.
My Observations
1. For finding values from "Categories", I needed to change the code a little each time, depending on the way the values were mentioned in a page.
2. Finding values from infobox was comparatively easy and works well for most pages if the property exists. Still, one would have to check for the titles in the infobox.