How can I compute TF/IDF with SQL (BigQuery)

Asked 31/10, 2017 at 5:42 Answered 23/7, 2019 at 5:44

Solved sql google-bigquery text-analysis

I'm doing text analysis over reddit comments, and I want to calculate the TF-IDF within BigQuery.

Ferous answered 31/10, 2017 at 5:42 Comment(0)

This one might be easier to understand - takes a dataset that already has the # of words per TV station and day:

# in this query the combination of date+station represents a "document"

WITH data AS (
  SELECT *
  FROM `gdelt-bq.gdeltv2.iatv_1grams`
  WHERE DATE BETWEEN 20190601 AND 20190629
  AND station NOT IN ('KSTS', 'KDTV')
)
, word_day_station AS (
  # how many times a word is mentioned in each "document"
  SELECT word, SUM(count) counts, date, station
  FROM data
  GROUP BY 1, 3, 4
)
, day_station AS (
  # total # of words in each "document" 
  SELECT SUM(count) counts, date, station
  FROM data
  GROUP BY 2,3
)
, tf AS (
  # TF for a word in a "document"
  SELECT word, date, station, a.counts/b.counts tf
  FROM word_day_station a
  JOIN day_station b
  USING(date, station)
)
, word_in_docs AS (
  # how many "documents" have a word
  SELECT word, COUNT(DISTINCT FORMAT('%i %s', date, station)) indocs
  FROM word_day_station
  GROUP BY 1
)
, total_docs AS (
  # total # of docs
  SELECT COUNT(DISTINCT FORMAT('%i %s', date, station)) total_docs
  FROM data
)
, idf AS (
  # IDF for a word
  SELECT word, LOG(total_docs.total_docs/indocs) idf
  FROM word_in_docs
  CROSS JOIN total_docs
)

SELECT date,
  ARRAY_AGG(STRUCT(station, ARRAY_TO_STRING(words, ', ')) ORDER BY station) top_words
FROM (
  SELECT date, station, ARRAY_AGG(word ORDER BY tfidf DESC LIMIT 5) words
  FROM (
    SELECT word, date, station, tf.tf * idf.idf tfidf
    FROM tf
    JOIN idf
    USING(word)
  )
  GROUP BY date, station
)
GROUP BY date
ORDER BY date DESC

Ferous answered 23/7, 2019 at 5:44 Comment(0)

This query works on 5 stages:

Obtain all reddit posts I'm interested in. Normalize words (LOWER, only letters and ', unescape some HTML). Split those words into an array.
Calculate the tf (term frequency) for each word in each doc - count how many times it shows up in each doc, relative to the number of words in said doc.
For each word, calculate the number of docs that contain it.
From (3.), obtain idf (inverse document frequency): "inverse fraction of the documents that contain the word, obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient"
Multiply tf*idf to obtain tf-idf.

This query manages to do this on one pass, by passing the obtained values up the chain.

#standardSQL
WITH words_by_post AS (
  SELECT CONCAT(link_id, '/', id) id, REGEXP_EXTRACT_ALL(
    REGEXP_REPLACE(REGEXP_REPLACE(LOWER(body), '&amp;', '&'), r'&[a-z]{2,4};', '*')
      , r'[a-z]{2,20}\'?[a-z]+') words
  , COUNT(*) OVER() docs_n
  FROM `fh-bigquery.reddit_comments.2017_07`  
  WHERE body NOT IN ('[deleted]', '[removed]')
  AND subreddit = 'movies'
  AND score > 100
), words_tf AS (
  SELECT id, word, COUNT(*) / ARRAY_LENGTH(ANY_VALUE(words)) tf, ARRAY_LENGTH(ANY_VALUE(words)) words_in_doc
    , ANY_VALUE(docs_n) docs_n
  FROM words_by_post, UNNEST(words) word
  GROUP BY id, word
  HAVING words_in_doc>30
), docs_idf AS (
  SELECT tf.id, word, tf.tf, ARRAY_LENGTH(tfs) docs_with_word, LOG(docs_n/ARRAY_LENGTH(tfs)) idf
  FROM (
    SELECT word, ARRAY_AGG(STRUCT(tf, id, words_in_doc)) tfs, ANY_VALUE(docs_n) docs_n
    FROM words_tf
    GROUP BY 1
  ), UNNEST(tfs) tf
)    


SELECT *, tf*idf tfidf
FROM docs_idf
WHERE docs_with_word > 1
ORDER BY tfidf DESC
LIMIT 1000

Ferous answered 31/10, 2017 at 5:42 Comment(4)

I can be wrong but somehow I feel that in REGEXP_EXTRACT_ALL instead of r'[a-z]{2,20}\'?[a-z]*' you should use r'[a-z]{2,20}\'?[a-z]+' – Kordofanian 31/10, 2017 at 6:42

Thanks for checking @MikhailBerlyant! The difference would be if words can end on a '? – Ferous 1/11, 2017 at 6:39

That is exactly what I thought - so check what if comment has word surraunding apostrophy - like 'abc' for example :o) – Kordofanian 1/11, 2017 at 13:13

I was successful in getting this to work on a completely different data set. Thank you! Now I am trying to add stemming into the mix. If you have a stemming version, I'd love to see it. My plan is to use a simple dictionary stemmer and do a join to replace the 'word' with a 'stem'. Any suggestions on better approaches? – Piste 22/9, 2018 at 16:19

Stack Overflow dataset version:

#standardSQL
WITH words_by_post AS (
  SELECT id, REGEXP_EXTRACT_ALL(
       REGEXP_REPLACE(
       REGEXP_REPLACE(
       REGEXP_REPLACE(
         LOWER(CONCAT(title, ' ', body))
         , r'&amp;', '&')
         , r'&[a-z]*;', '')
         , r'<[= \-:a-z0-9/\."]*>', '')
      , r'[a-z]{2,20}\'?[a-z]+') words
     , title, body
  , COUNT(*) OVER() docs_n
  FROM `bigquery-public-data.stackoverflow.posts_questions`  
  WHERE score >= 150
), words_tf AS (
  SELECT id, words
  , ARRAY(
     SELECT AS STRUCT w word, COUNT(*)/ARRAY_LENGTH(words) tf
     FROM UNNEST(words) a 
     JOIN (SELECT DISTINCT w FROM UNNEST(words) w) b 
     ON a=b.w 
     WHERE w NOT IN ('the', 'and', 'for', 'this', 'that', 'can', 'but') 
     GROUP BY word ORDER BY word
   ) tfs
   , ARRAY_LENGTH((words)) words_in_doc
   , docs_n
   , title, body
  FROM words_by_post
  WHERE ARRAY_LENGTH(words)>20
), docs_idf AS (
  SELECT *, LOG(docs_n/docs_with_word) idf
  FROM (
    SELECT id, word, tf.tf, COUNTIF(word IN UNNEST(words)) OVER(PARTITION BY word) docs_with_word, docs_n
     , title, body
    FROM words_tf, UNNEST(tfs) tf
  )
)    


SELECT id, ARRAY_AGG(STRUCT(word, tf*idf AS tf_idf, docs_with_word) ORDER BY tf*idf DESC) tfidfs
#  , ANY_VALUE(title) title, ANY_VALUE(body) body # makes query slower
FROM docs_idf
WHERE docs_with_word > 1
GROUP BY 1

Improvements versus the previous answer: One less GROUP BY across the dataset is needed, helping the query run faster.

Ferous answered 7/11, 2017 at 23:1 Comment(0)

This one might be easier to understand - takes a dataset that already has the # of words per TV station and day:

# in this query the combination of date+station represents a "document"

WITH data AS (
  SELECT *
  FROM `gdelt-bq.gdeltv2.iatv_1grams`
  WHERE DATE BETWEEN 20190601 AND 20190629
  AND station NOT IN ('KSTS', 'KDTV')
)
, word_day_station AS (
  # how many times a word is mentioned in each "document"
  SELECT word, SUM(count) counts, date, station
  FROM data
  GROUP BY 1, 3, 4
)
, day_station AS (
  # total # of words in each "document" 
  SELECT SUM(count) counts, date, station
  FROM data
  GROUP BY 2,3
)
, tf AS (
  # TF for a word in a "document"
  SELECT word, date, station, a.counts/b.counts tf
  FROM word_day_station a
  JOIN day_station b
  USING(date, station)
)
, word_in_docs AS (
  # how many "documents" have a word
  SELECT word, COUNT(DISTINCT FORMAT('%i %s', date, station)) indocs
  FROM word_day_station
  GROUP BY 1
)
, total_docs AS (
  # total # of docs
  SELECT COUNT(DISTINCT FORMAT('%i %s', date, station)) total_docs
  FROM data
)
, idf AS (
  # IDF for a word
  SELECT word, LOG(total_docs.total_docs/indocs) idf
  FROM word_in_docs
  CROSS JOIN total_docs
)

SELECT date,
  ARRAY_AGG(STRUCT(station, ARRAY_TO_STRING(words, ', ')) ORDER BY station) top_words
FROM (
  SELECT date, station, ARRAY_AGG(word ORDER BY tfidf DESC LIMIT 5) words
  FROM (
    SELECT word, date, station, tf.tf * idf.idf tfidf
    FROM tf
    JOIN idf
    USING(word)
  )
  GROUP BY date, station
)
GROUP BY date
ORDER BY date DESC

Ferous answered 23/7, 2019 at 5:44 Comment(0)

Recommended topics

Hot tags