"""Count word frequencies in a file of line-delimited JSON tweets and print
the top 10 words, then print every tweet text mentioning "California"."""

import json
from collections import Counter

# Characters that act as word delimiters inside tweet text.  Each is mapped to
# a space so str.split() breaks on it.  NOTE: str.maketrans requires both
# strings to have the same length — one replacement space per delimiter (the
# original passed a single space for six delimiters, which raises ValueError).
_DELIMITER_TABLE = str.maketrans("-,;:)(", "      ")

# Words shorter than this ("to", "the", ...) are not interesting to count.
MIN_WORD_LENGTH = 4


def load_tweets(path):
    """Parse one JSON tweet per line from *path*; return a list of dicts.

    Interruptions in the Twitter stream leave (near-)empty lines in the dump;
    lines of length <= 2 are skipped, matching the original filter.
    """
    tweets = []
    with open(path, encoding="utf-8") as tweet_file:  # closed automatically
        for line in tweet_file:
            if len(line) > 2:
                tweets.append(json.loads(line))
    return tweets


def count_words(tweets, min_length=MIN_WORD_LENGTH):
    """Return a Counter of words of length >= *min_length* across tweet texts.

    Tweets without a "text" field (e.g. delete notices) are skipped.
    Delimiter characters are converted to spaces before splitting.
    """
    counter = Counter()
    for tweet in tweets:
        text = tweet.get("text")
        if text is None:
            continue
        for word in text.translate(_DELIMITER_TABLE).split():
            if len(word) >= min_length:
                counter[word] += 1
    return counter


def tweets_mentioning(tweets, term):
    """Yield the text of each tweet whose text contains *term* (case-sensitive)."""
    for tweet in tweets:
        text = tweet.get("text")
        if text is not None and term in text:
            yield text


def main():
    tweets = load_tweets("twitter_genetics.txt")
    # Top 10 words by frequency, tab-separated (same output shape as before).
    for word, count in count_words(tweets).most_common(10):
        print(word + "\t" + str(count))
    # Only pull out tweets about California.
    for text in tweets_mentioning(tweets, "California"):
        print(text)


if __name__ == "__main__":
    main()