So in the last update we had a program that listened to Twitter and then displayed all the tweets that matched the search terms. At that point it wasn't much better than going to Twitter, searching for all of your search terms and reading the results, and I don't know about you, but I don't think I have time to read thousands of tweets a day (on a randomly chosen test day I got 45,162 tweets that matched my search terms).
So I have made a few more changes to the code so that it outputs the tweets to a few files and collates the results.
The output files are stored in a directory tree sorted by date, e.g. YYYY/MM/DD/results. The program saves one file containing all the tweets, a folder with an individual file for each search term, a folder of files containing the tweets which match two search terms at once, and finally two files that summarise the results, telling you how many tweets matched each term and each pair of terms.
Now to try and get the code to email me when it's done for the day.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
"""Listen to the Twitter streaming API for a day and collate matching tweets.

Every tweet that matches one of TERMS is kept in memory while the stream is
open.  When the run ends (time limit reached, stream error, or Ctrl-C) the
tweets are written below a ``<year>/<month>/<day>/`` directory:

* ``tweets.txt``            - every collected tweet
* ``terms/<term>.txt``      - the tweets containing that term
* ``pairs/<a>+<b>.txt``     - the tweets containing both terms (only written
                              when at least one tweet matches)
* ``Results per Term.txt``  - number of tweets per term
* ``Results per pair.txt``  - number of tweets per pair of terms

The script is written to run unchanged on Python 2.7 and Python 3.
"""
import io
import os
import time
from datetime import datetime
from itertools import combinations

from twython import Twython
from twython import TwythonStreamer

# Search terms.  Matching against the saved tweets is a case-sensitive
# substring test, which is why both capitalisations are listed.
TERMS = ['#GaN', '#gan', '#physics', '#Physics', '#LEDs', '#LED',
         '#journorequest', '#Science', '#science', '#manchester',
         '#Manchester', '#cambridge', '#Cambridge']

APP_KEY = 'xxxxxxxxxxxxxxxxxxxxxxxxx'
APP_SECRET = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
OAUTH_TOKEN = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
OAUTH_TOKEN_SECRET = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

starttime = time.time()
runtime = 60 * 60 * 24  # how long to listen for, in seconds (one day)

# Collected tweets, one [screen_name, text, datetime] row per tweet.
tweets = []
# The active streamer; stays None until main() has created it.
stream = None


class MyStreamer(TwythonStreamer):
    """Streamer that stores matching tweets in the module-level ``tweets``."""

    def on_success(self, data):
        """Record one streamed message, or disconnect once the run is over.

        The time limit is only checked when a message arrives, so the stream
        closes on the first message received after ``starttime + runtime``.
        """
        if time.time() >= starttime + runtime:
            self.disconnect()
            return
        # Messages without a 'text' key are not tweets and are skipped.
        if 'text' in data:
            print(data['user']['screen_name'])
            tweets.append([
                data['user']['screen_name'],
                # Kept as text; encoding to UTF-8 happens once, on write.
                data['text'],
                datetime.fromtimestamp(int(data['timestamp_ms']) / 1000.0),
            ])

    def on_error(self, status_code, data, headers=None):
        """Print the HTTP status code and close the stream.

        NOTE(review): ``headers`` is accepted because newer Twython releases
        appear to pass it as a third argument - confirm against the installed
        version.  The default keeps the two-argument call working.
        """
        print(status_code)
        self.disconnect()


def _format_row(tweet):
    """Return one tab-separated output line for a [user, text, stamp] row."""
    user, text, stamp = tweet
    return u'{0}\t"{1}"\t{2}\n'.format(user, text, stamp)


def _write_rows(path, rows, header=None):
    """Write ``rows`` to ``path`` as UTF-8, preceded by an optional header."""
    with io.open(path, 'w', encoding='utf-8') as handle:
        if header is not None:
            handle.write(header)
        handle.writelines(_format_row(row) for row in rows)


def finishup():
    """Disconnect the stream and write all result files for this run.

    Reads the module-level ``tweets``, ``stream``, ``starttime`` and
    ``TERMS``.  Output goes below ``<year>/<month>/<day>/`` relative to the
    current working directory; month and day are not zero-padded.
    """
    print('Finishing up')
    started = time.localtime(starttime)
    # First three fields of the struct_time are year, month, day of month.
    directory = '/'.join(str(part) for part in started[:3]) + '/'
    terms_dir = directory + 'terms/'
    pairs_dir = directory + 'pairs/'
    for path in (directory, terms_dir, pairs_dir):
        if not os.path.exists(path):
            os.makedirs(path)
    # The stream may not exist if the run was interrupted before connecting.
    if stream is not None:
        stream.disconnect()

    print('Saving Tweets')
    _write_rows(directory + 'tweets.txt', tweets,
                header=u'User\t"Tweet"\tDate-Time-Stamp\n')

    print('Saving tweets per term')
    term_counts = []
    for term in TERMS:
        matches = [tweet for tweet in tweets if term in tweet[1]]
        _write_rows(terms_dir + term + '.txt', matches)
        term_counts.append((term, len(matches)))

    print('Saving results per term')
    with io.open(directory + 'Results per Term.txt', 'w',
                 encoding='utf-8') as handle:
        for term, count in term_counts:
            handle.write(u'{0}\t{1}\n'.format(term, count))

    print('Saving the tweets per pair')
    pair_counts = []
    # combinations() yields each unordered pair once, in list order.
    for first, second in combinations(TERMS, 2):
        if first == second:
            # Only possible if TERMS contains a duplicate entry.
            continue
        matches = [tweet for tweet in tweets
                   if first in tweet[1] and second in tweet[1]]
        path = pairs_dir + first + '+' + second + '.txt'
        if matches:
            _write_rows(path, matches)
        elif os.path.exists(path):
            # Drop a stale file left by an earlier run on the same day.
            os.remove(path)
        # The count previously stayed at zero for every pair, which deleted
        # every pair file and made the summary useless.
        pair_counts.append((first, second, len(matches)))

    print('Saving the results per pair')
    with io.open(directory + 'Results per pair.txt', 'w',
                 encoding='utf-8') as handle:
        for first, second, count in pair_counts:
            handle.write(u'{0}\t{1}\t{2}\n'.format(first, second, count))


def main():
    """Stream tweets until the run ends, then write the result files."""
    global stream
    print('%s %s' % (time.localtime(starttime),
                     time.localtime(starttime + runtime)))
    try:
        stream = MyStreamer(APP_KEY, APP_SECRET,
                            OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
        # Blocks until the streamer disconnects itself.
        stream.statuses.filter(track=TERMS)
        finishup()
    except KeyboardInterrupt:
        print('Exited manually')
        # Still save whatever was collected before the interrupt.
        finishup()


if __name__ == '__main__':
    main()