User:Daniel Mietchen/Sandbox4URLshortening

This page is used in conjunction with m:Special:UrlShortener as a workaround for https://phabricator.wikimedia.org/T220703. URL shortening can also be triggered via the MediaWiki API. Another option for URL shortening is Query Chest.

The following query uses these:

  • Properties: published in (P1433), main subject (P921), title (P1476), KIT Linked Open Numbers ID (P5176), numeric value (P1181)
    # Most frequent n-grams from a random set of publications in the Gartenlaube which are missing main subject tags
    #
    # The query is built from four named subqueries that are joined via INCLUDE:
    #   %items   - sampled publications that have no main subject (P921) yet
    #   %titles  - their German titles, cleaned, lower-cased and padded with marker tokens
    #   %regexes - for every start position, four regexes that each pick out two neighbouring words
    #   %ngrams  - the n-grams cut out of the titles, with occurrence count and score
    # The outer query drops n-grams that start or end with a stop word and adds an example title.
    SELECT DISTINCT ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle

    WITH
    { # Generating a list of entities to be analyzed:
      # a sample (limit 10000) of "published in" (P1433) statements pointing to Q655617,
      # keeping only publications without any "main subject" (P921) statement
      SELECT ?Publication
      {
        SERVICE bd:sample { ?Publication wdt:P1433 wd:Q655617 . bd:serviceParam bd:sample.limit 10000 }
        FILTER NOT EXISTS { ?Publication wdt:P921 ?Schlagwort. }
      }
    } AS %items
    WITH
    { # Preprocessing the titles
      SELECT ?Title ?Publication ?Seeds ?ClearTitleLength
      {
        INCLUDE %items
        ?Publication wdt:P1476 ?Title.
        FILTER(LANG(?Title)="de") # only German titles are analysed

        # Remove some frequent special characters, including colons and semicolons,
        # so that the only ":" and ";" left in ?Seeds are the marker tokens added below
        BIND (REPLACE(STR(?Title),"[\\.:,;\\[\\]\\?()$]","") AS ?ClearTitle)
        BIND (STRLEN(?ClearTitle) AS ?ClearTitleLength)

        # Pad the title with eight marker tokens on each side. The extraction below always
        # reads eight consecutive tokens; the markers are stripped afterwards, so windows
        # overlapping the padding yield the shorter n-grams at the start and end of a title.
        BIND ("::: ::: ::: ::: ::: ::: ::: ::: " AS ?StartCodon)
        BIND (" ;;; ;;; ;;; ;;; ;;; ;;; ;;; ;;;" AS ?StopCodon)
        BIND (LCASE(CONCAT(?StartCodon, ?ClearTitle, ?StopCodon)) AS ?Seeds)
      }
    } AS %titles
    WITH
    { # Generating a list of regexes to look for the NumericValue-th word in a string
      # Based on https://w.wiki/KG$ by Jura1
      #
      # Items with a KIT Linked Open Numbers ID (P5176) serve as a source of the integers
      # 1..150 via their numeric value (P1181). Each regex "^([^ ]+ ){k}([^ ]+) .*" skips
      # k tokens and captures the following one as $2, while $1 holds the last skipped token.
      # With k = NumericValue-1, +1, +3 and +5 the four regexes together cover eight
      # consecutive tokens.
      SELECT ?Regex1 ?Regex2 ?Regex3 ?Regex4 ?NumericValue
      {
        ?NumberItem wdt:P5176 []; wdt:P1181 ?NumericValue .
        FILTER( ?NumericValue > 0 )
        FILTER( ?NumericValue < 151)
        BIND("^([^ ]+ ){" AS ?RegexStart)
        BIND("}([^ ]+) .*" AS ?RegexEnd)
        BIND( CONCAT( ?RegexStart , STR( ?NumericValue - 1 ), ?RegexEnd ) AS ?Regex1)
        BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 1 ), ?RegexEnd ) AS ?Regex2)
        BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 3 ), ?RegexEnd ) AS ?Regex3)
        BIND( CONCAT( ?RegexStart , STR( ?NumericValue + 5 ), ?RegexEnd ) AS ?Regex4)
      }
    } AS %regexes
    WITH
    { # Applying the regexes to the titles to extract ngrams (for n <= 8), and counting occurrences of the ngrams across titles
      SELECT
        ?Ngram
        ?N
        (COUNT(DISTINCT ?Title) AS ?Count)
        ?Length
        ?Dashes
        # Score favours frequent, long n-grams; dashes raise it, a higher word count lowers it
        (( ?Count * ?Length * ( (?Dashes + 1) / ?N) ) AS ?Score)
        (SAMPLE(?Publication) AS ?ExamplePub)
      {
        # The two subqueries share no variable, so every title is combined with every regex set
        INCLUDE %regexes
        INCLUDE %titles

        # Eight consecutive tokens of the padded title, separated by single spaces
        BIND(
          (CONCAT(
            REPLACE(?Seeds, ?Regex1, "$1"), " ",
            REPLACE(?Seeds, ?Regex1, "$2"), " ",
            REPLACE(?Seeds, ?Regex2, "$1"), " ",
            REPLACE(?Seeds, ?Regex2, "$2"), " ",
            REPLACE(?Seeds, ?Regex3, "$1"), " ",
            REPLACE(?Seeds, ?Regex3, "$2"), " ",
            REPLACE(?Seeds, ?Regex4, "$1"), " ",
            REPLACE(?Seeds, ?Regex4, "$2")
          )
        ) AS ?NgramCandidate)

        # Strip the marker characters, trim both ends, and collapse runs of spaces
        BIND(
          (REPLACE
           (REPLACE
            (REPLACE
             (REPLACE
              (STR(?NgramCandidate),"([;:])",""),
              "(^\\s+)",""),
             "(\\s+$)",""),
            "([ ]{2,})"," ")
          ) AS ?Ngram)

        BIND(STRLEN(?Ngram) AS ?Length)
        FILTER (?Length > 3 )
        # When a regex does not match, REPLACE returns the whole padded title; such
        # candidates end up longer than the cleaned title and are discarded here
        FILTER (?Length <= ?ClearTitleLength )

        # Number of words = number of whitespace characters + 1
        BIND(STRLEN(REPLACE(?Ngram, "\\S", "")) + 1 AS ?N)
        # Number of "-" characters in the n-gram
        BIND((STRLEN(?Ngram) - STRLEN(REPLACE(?Ngram, "-", ""))) AS ?Dashes)
      }
      # Group only by the non-aggregated variables; ?Count, ?Score and ?ExamplePub are
      # computed per group in the SELECT clause and must not be grouping keys themselves
      GROUP BY ?Ngram ?N ?Length ?Dashes
    #   HAVING(?Count > 1)   # optional, disabled: keep only n-grams occurring in more than one title
    } AS %ngrams
    WHERE {
      INCLUDE %ngrams
      # Exclude Ngrams starting or ending with any of a set of blacklisted words.
      # The titles are German and lower-cased, so the list holds lower-case German
      # articles, prepositions and conjunctions.
      BIND("(am|an|auf|aus|bei|das|dem|den|der|des|die|durch|ein|eine|einem|einen|einer|eines|für|im|in|mit|nach|oder|über|um|und|unter|vom|von|vor|zu|zum|zur|zwischen)" AS ?blacklist)
      # ?Ngram is trimmed and single-spaced, so one space delimits the first/last word
      BIND( CONCAT( "^", ?blacklist, " ") AS ?RegexBlackStart)
      BIND( CONCAT( " ", ?blacklist, "$") AS ?RegexBlackEnd)
      FILTER (!REGEX(?Ngram, ?RegexBlackStart))
      FILTER (!REGEX(?Ngram, ?RegexBlackEnd))

    #   # Optional, disabled: exclude Ngrams too similar to the target
    #   FILTER (!CONTAINS(?Ngram, "climate"))
    #   FILTER (!CONTAINS(?Ngram, "change"))

      # Fetch the German title of the example publication for display
      ?ExamplePub wdt:P1476 ?ExamplePubTitle.
      FILTER(LANG(?ExamplePubTitle)="de")
    }
    ORDER BY DESC(?Score) DESC(?Count) DESC(?Length)
    LIMIT 200