"""Count word frequencies in a file of line-delimited JSON tweets and print
the top 10 words, then print every tweet text mentioning "California"."""

import json
from collections import Counter

# Characters that act as word delimiters inside tweet text.  Each is mapped to
# a space so str.split() breaks on it.  NOTE: str.maketrans requires both
# strings to have the same length — one replacement space per delimiter (the
# original passed a single space for six delimiters, which raises ValueError).
_DELIMITER_TABLE = str.maketrans("-,;:)(", "      ")

# Words shorter than this ("to", "the", ...) are not interesting to count.
MIN_WORD_LENGTH = 4


def load_tweets(path):
    """Parse one JSON tweet per line from *path*; return a list of dicts.

    Interruptions in the Twitter stream leave (near-)empty lines in the dump;
    lines of length <= 2 are skipped, matching the original filter.
    """
    tweets = []
    with open(path, encoding="utf-8") as tweet_file:  # closed automatically
        for line in tweet_file:
            if len(line) > 2:
                tweets.append(json.loads(line))
    return tweets


def count_words(tweets, min_length=MIN_WORD_LENGTH):
    """Return a Counter of words of length >= *min_length* across tweet texts.

    Tweets without a "text" field (e.g. delete notices) are skipped.
    Delimiter characters are converted to spaces before splitting.
    """
    counter = Counter()
    for tweet in tweets:
        text = tweet.get("text")
        if text is None:
            continue
        for word in text.translate(_DELIMITER_TABLE).split():
            if len(word) >= min_length:
                counter[word] += 1
    return counter


def tweets_mentioning(tweets, term):
    """Yield the text of each tweet whose text contains *term* (case-sensitive)."""
    for tweet in tweets:
        text = tweet.get("text")
        if text is not None and term in text:
            yield text


def main():
    tweets = load_tweets("twitter_genetics.txt")
    # Top 10 words by frequency, tab-separated (same output shape as before).
    for word, count in count_words(tweets).most_common(10):
        print(word + "\t" + str(count))
    # Only pull out tweets about California.
    for text in tweets_mentioning(tweets, "California"):
        print(text)


if __name__ == "__main__":
    main()