Storing Tweets and Summarising

So in the last update we had a program that listened to Twitter and then displayed all the tweets that matched the search terms. At that point it wasn't much better than going to Twitter, searching for each of your search terms and reading the results, and I don't know about you, but I don't think I have time to read thousands of tweets a day (on a randomly chosen test day I got 45,162 tweets that matched my search terms).

So I have made a few more changes to the code so that it outputs the tweets to a few files and collates the results.

The output files are stored in a directory tree sorted by date, e.g. YYYY/MM/DD/results. The program saves one file containing all the tweets, a folder with an individual file for each search term, a folder of files containing the tweets which match two or more search terms, and finally two files that summarise the results, telling you how many tweets matched each term and each pair of terms.

Now to try and get the code to email me when it's done for the day.

import time
from datetime import datetime
from twython import Twython
from twython import TwythonStreamer
import os

# Search terms passed to the streaming filter. Several hashtags are listed in
# more than one capitalisation because the per-term matching done in finishup()
# is a plain, case-sensitive substring test.
TERMS = [
    '#GaN',
    '#gan',
    '#physics',
    '#Physics',
    '#LEDs',
    '#LED',
    '#journorequest',
    '#Science',
    '#science',
    '#manchester',
    '#Manchester',
    '#cambridge',
    '#Cambridge',
]

# Twitter API credentials (placeholders; substitute real values before running).
APP_KEY = 'xxxxxxxxxxxxxxxxxxxxxxxxx'
APP_SECRET = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
OAUTH_TOKEN = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
OAUTH_TOKEN_SECRET = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

# Moment the script started (seconds since the epoch) and how long to keep
# collecting tweets, in seconds: one full day.
starttime = time.time()
runtime = 24 * 60 * 60

class MyStreamer(TwythonStreamer):
    """Streamer that buffers incoming tweets until the run time has elapsed.

    Relies on the module-level ``starttime``, ``runtime`` and ``tweets``
    names being defined before the stream starts delivering messages.
    """

    def on_success(self, data):
        """Store one streamed message, or disconnect once the run is over.

        Messages without a ``'text'`` key are ignored. For the others the
        sender's screen name is printed and a
        ``[screen_name, utf-8 encoded text, datetime]`` entry is appended to
        the module-level ``tweets`` list. ``data['timestamp_ms']`` is treated
        as milliseconds and converted to a local ``datetime``.
        """
        # Past the configured run time: stop listening instead of recording.
        if time.time() >= starttime + runtime:
            self.disconnect()
            return
        if 'text' in data:
            print(data['user']['screen_name'])
            tweets.append([
                data['user']['screen_name'],
                data['text'].encode('utf-8'),
                datetime.fromtimestamp(int(data['timestamp_ms']) / 1000.0),
            ])

    def on_error(self, status_code, data):
        """Print the reported status code and disconnect the stream."""
        print(status_code)
        self.disconnect()

def _tweet_line(tweet):
    """Format one [user, text, timestamp] entry as a tab-separated line."""
    return str(tweet[0]) + '\t"' + str(tweet[1]) + '"\t' + str(tweet[2]) + '\n'


def _write_tweets(path, entries, header=None):
    """Write the optional header, then one formatted line per entry, to path."""
    with open(path, 'w') as handle:
        if header is not None:
            handle.write(header)
        for entry in entries:
            handle.write(_tweet_line(entry))


def finishup():
    """Disconnect the stream and write the collected tweets and summaries.

    Uses the module-level ``starttime``, ``stream``, ``tweets`` and ``TERMS``.
    Everything is written below a ``<year>/<month>/<day>/`` directory derived
    from the local start time (relative to the current working directory):

    * ``tweets.txt`` - every collected tweet, preceded by a header line;
    * ``terms/<term>.txt`` - the tweets whose text contains that term;
    * ``Results per Term.txt`` - one ``term<TAB>count`` line per term;
    * ``pairs/<term>+<term2>.txt`` - the tweets containing both terms, kept
      only for pairs with at least one match;
    * ``Results per pair.txt`` - one ``term<TAB>term2<TAB>count`` line per
      pair, including pairs with a count of zero.
    """
    print('Finishing up')
    started = time.localtime(starttime)
    directory = str(started[0]) + '/' + str(started[1]) + '/' + str(started[2]) + '/'
    for folder in (directory, directory + 'terms/', directory + 'pairs/'):
        if not os.path.exists(folder):
            os.makedirs(folder)
    stream.disconnect()

    print('Saving Tweets')
    _write_tweets(directory + 'tweets.txt', tweets,
                  'User\t"Tweet"\tDate-Time-Stamp\n')

    print('Saving tweets per term')
    term_counts = []
    for term in TERMS:
        matches = [tweet for tweet in tweets if term in tweet[1]]
        _write_tweets(directory + 'terms/' + str(term) + '.txt', matches)
        term_counts.append((term, len(matches)))

    print('Saving results per term')
    with open(directory + 'Results per Term.txt', 'w') as handle:
        for term, count in term_counts:
            handle.write(str(term) + '\t' + str(count) + '\n')

    print('Saving the tweets per pai')
    pairs = []
    for index, term in enumerate(TERMS):
        # Only look at terms after this one so each unordered pair is
        # handled exactly once.
        for term2 in TERMS[index + 1:]:
            if term2 == term:
                continue
            path = directory + 'pairs/' + str(term) + '+' + str(term2) + '.txt'
            matches = [tweet for tweet in tweets
                       if term in tweet[1] and term2 in tweet[1]]
            # The count must reflect the real number of matches; previously it
            # was never incremented, so every pair file was deleted and every
            # pair was reported with a count of 0.
            if matches:
                _write_tweets(path, matches)
            elif os.path.exists(path):
                # Drop a stale file left behind by an earlier run.
                os.remove(path)
            pairs.append([term, term2, len(matches)])

    print('Saving the results per pair')
    with open(directory + 'Results per pair.txt', 'w') as handle:
        for first, second, count in pairs:
            handle.write(str(first) + '\t' + str(second) + '\t' + str(count) + '\n')
	
			



# Buffer that MyStreamer.on_success appends to and finishup() writes out.
tweets = []
# Show the local start time and the planned end time of this run.
print('%s %s' % (time.localtime(starttime), time.localtime(starttime + runtime)))

# Build the streamer before entering the try block so that `stream` is always
# bound by the time finishup() calls stream.disconnect().
stream = MyStreamer(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
try:
    stream.statuses.filter(track=TERMS)
except KeyboardInterrupt:
    print('Exited manually')
finally:
    # Save whatever has been collected however the stream ended (normal
    # disconnect, Ctrl-C, or an unexpected error), so a failure late in the
    # run does not throw the collected tweets away.
    finishup()

100

101

102

103

104

105

106

107

108

109

110

111

112

import time

from datetime import datetime

from twython import Twython

from twython import TwythonStreamer

import os

# Search terms passed to the streaming filter. Several hashtags are listed in
# more than one capitalisation because the per-term matching done in finishup()
# is a plain, case-sensitive substring test.
TERMS = [
    '#GaN',
    '#gan',
    '#physics',
    '#Physics',
    '#LEDs',
    '#LED',
    '#journorequest',
    '#Science',
    '#science',
    '#manchester',
    '#Manchester',
    '#cambridge',
    '#Cambridge',
]

# Twitter API credentials (placeholders; substitute real values before running).
APP_KEY = 'xxxxxxxxxxxxxxxxxxxxxxxxx'
APP_SECRET = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
OAUTH_TOKEN = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
OAUTH_TOKEN_SECRET = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

# Moment the script started (seconds since the epoch) and how long to keep
# collecting tweets, in seconds: one full day.
starttime = time.time()
runtime = 24 * 60 * 60

class MyStreamer(TwythonStreamer):
    """Streamer that buffers incoming tweets until the run time has elapsed.

    Relies on the module-level ``starttime``, ``runtime`` and ``tweets``
    names being defined before the stream starts delivering messages.
    """

    def on_success(self, data):
        """Store one streamed message, or disconnect once the run is over.

        Messages without a ``'text'`` key are ignored. For the others the
        sender's screen name is printed and a
        ``[screen_name, utf-8 encoded text, datetime]`` entry is appended to
        the module-level ``tweets`` list. ``data['timestamp_ms']`` is treated
        as milliseconds and converted to a local ``datetime``.
        """
        # Past the configured run time: stop listening instead of recording.
        if time.time() >= starttime + runtime:
            self.disconnect()
            return
        if 'text' in data:
            print(data['user']['screen_name'])
            tweets.append([
                data['user']['screen_name'],
                data['text'].encode('utf-8'),
                datetime.fromtimestamp(int(data['timestamp_ms']) / 1000.0),
            ])

    def on_error(self, status_code, data):
        """Print the reported status code and disconnect the stream."""
        print(status_code)
        self.disconnect()

def _tweet_line(tweet):
    """Format one [user, text, timestamp] entry as a tab-separated line."""
    return str(tweet[0]) + '\t"' + str(tweet[1]) + '"\t' + str(tweet[2]) + '\n'


def _write_tweets(path, entries, header=None):
    """Write the optional header, then one formatted line per entry, to path."""
    with open(path, 'w') as handle:
        if header is not None:
            handle.write(header)
        for entry in entries:
            handle.write(_tweet_line(entry))


def finishup():
    """Disconnect the stream and write the collected tweets and summaries.

    Uses the module-level ``starttime``, ``stream``, ``tweets`` and ``TERMS``.
    Everything is written below a ``<year>/<month>/<day>/`` directory derived
    from the local start time (relative to the current working directory):

    * ``tweets.txt`` - every collected tweet, preceded by a header line;
    * ``terms/<term>.txt`` - the tweets whose text contains that term;
    * ``Results per Term.txt`` - one ``term<TAB>count`` line per term;
    * ``pairs/<term>+<term2>.txt`` - the tweets containing both terms, kept
      only for pairs with at least one match;
    * ``Results per pair.txt`` - one ``term<TAB>term2<TAB>count`` line per
      pair, including pairs with a count of zero.
    """
    print('Finishing up')
    started = time.localtime(starttime)
    directory = str(started[0]) + '/' + str(started[1]) + '/' + str(started[2]) + '/'
    for folder in (directory, directory + 'terms/', directory + 'pairs/'):
        if not os.path.exists(folder):
            os.makedirs(folder)
    stream.disconnect()

    print('Saving Tweets')
    _write_tweets(directory + 'tweets.txt', tweets,
                  'User\t"Tweet"\tDate-Time-Stamp\n')

    print('Saving tweets per term')
    term_counts = []
    for term in TERMS:
        matches = [tweet for tweet in tweets if term in tweet[1]]
        _write_tweets(directory + 'terms/' + str(term) + '.txt', matches)
        term_counts.append((term, len(matches)))

    print('Saving results per term')
    with open(directory + 'Results per Term.txt', 'w') as handle:
        for term, count in term_counts:
            handle.write(str(term) + '\t' + str(count) + '\n')

    print('Saving the tweets per pai')
    pairs = []
    for index, term in enumerate(TERMS):
        # Only look at terms after this one so each unordered pair is
        # handled exactly once.
        for term2 in TERMS[index + 1:]:
            if term2 == term:
                continue
            path = directory + 'pairs/' + str(term) + '+' + str(term2) + '.txt'
            matches = [tweet for tweet in tweets
                       if term in tweet[1] and term2 in tweet[1]]
            # The count must reflect the real number of matches; previously it
            # was never incremented, so every pair file was deleted and every
            # pair was reported with a count of 0.
            if matches:
                _write_tweets(path, matches)
            elif os.path.exists(path):
                # Drop a stale file left behind by an earlier run.
                os.remove(path)
            pairs.append([term, term2, len(matches)])

    print('Saving the results per pair')
    with open(directory + 'Results per pair.txt', 'w') as handle:
        for first, second, count in pairs:
            handle.write(str(first) + '\t' + str(second) + '\t' + str(count) + '\n')

# Buffer that MyStreamer.on_success appends to and finishup() writes out.
tweets = []
# Show the local start time and the planned end time of this run.
print('%s %s' % (time.localtime(starttime), time.localtime(starttime + runtime)))

# Build the streamer before entering the try block so that `stream` is always
# bound by the time finishup() calls stream.disconnect().
stream = MyStreamer(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
try:
    stream.statuses.filter(track=TERMS)
except KeyboardInterrupt:
    print('Exited manually')
finally:
    # Save whatever has been collected however the stream ended (normal
    # disconnect, Ctrl-C, or an unexpected error), so a failure late in the
    # run does not throw the collected tweets away.
    finishup()

Simon Hammersley - Projects Blog - Featuring Matt Davies

Storing Tweets and Summarising

Leave a Reply Cancel reply

Share This:

Leave a Reply Cancel reply