#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
# author: M. Luz Congosto.
# Creative commons 3.0 spain
# http://creativecommons.org/licenses/by-nc/3.0/es/
"""
This script extracts the content of a set of tweets.

It is assumed that the tweets were extracted from tweetbackup.com in text
format. The parsing code expects the timestamp alone on its own line, followed
by the text of the tweet, which ends with " / @author":

    2010-05-03 18:08:46
    data is not cheap RT @vrandezo "We entered an age where data is cheap,
    but making sense of it is not" - @zephoria #www2010 / @justgrimes

usage: tweets_info.py filename [--filter filename] [--all] [--authors_count]
       [--authors_list] [--tweets_by_day] [--tweets_by_hour] [--hashtag_count]
       [--hashtag_list] [--words_count] [--words_list] [--urls]

In the output names below, {filename} is the base name of the input file
(extension included); the files are written in the current directory.

Options:
    filename: name of the input file with the tweets.
    --filter filename: name of the filter file with a list of words to be
        filtered. It may appear anywhere after the input file name.
    --all: It invokes all options of the command.
    --authors_count: It extracts the unique authors list and counts the number
        of tweets written by each one.
        The output is written in the file {filename}_authors_count
    --authors_list: It extracts the authors list in order of appearance.
        The output is written in the file {filename}_authors_list
    --tweets_by_day: It extracts the number of tweets written by day.
        The output is written in the file {filename}_tweets_by_day
    --tweets_by_hour: It extracts the number of tweets by the hour.
        The output is written in the file {filename}_tweets_by_hour
    --hashtag_count: It extracts the unique hashtag list and counts the number
        of occurrences of each one.
        The output is written in the file {filename}_hashtag_count
    --hashtag_list: It extracts the hashtags list in order of appearance.
        The output is written in the file {filename}_hashtag_list
    --words_count: It extracts the unique words list and counts the number of
        occurrences of each one. If a filter words file is given
        [--filter filename], those words are left out.
        The output is written in the file {filename}_words_count
    --words_list: It extracts the words list in order of appearance. If a
        filter words file is given [--filter filename], those words are left
        out.
        The output is written in the file {filename}_words_list
    --urls: It extracts the urls and unshortens them if they are shortened.
        It makes a ranking of urls, sites and shortener sites by number of
        occurrences. The output is written in these files:
            {filename}_urls_translates.csv
            {filename}_shorten_urls.csv
            {filename}_large_urls.csv
            {filename}_sites.csv
            {filename}_shorten_sites.csv
"""
import io
import os
import re
import sys
import time
import socket
import urllib
import threading

try:  # Python 2 module names
    import urllib2
    import httplib
except ImportError:  # Python 3: same API under new names
    import urllib.request as urllib2
    import http.client as httplib

# Shared state filled in by get_urls() and by the thread_get_url workers.
urls_rank = {}    # url found in the tweets -> number of occurrences
urls_large = {}   # url found in the tweets -> final url after redirections
urls_errors = 0   # occurrences of urls that could not be opened

# One lock for every worker; a lock created inside run() protects nothing.
_urls_lock = threading.Lock()

URL_TIMEOUT = 10       # seconds granted to each url request
URL_BATCH_SIZE = 100   # number of urls expanded in parallel

USAGE = ('usage: filename [--filter filename] [--all] [--authors_count] '
         '[--authors_list] [--tweets_by_day] [--tweets_by_hour] '
         '[--hashtag_count] [--hashtag_list] [--words_count] [--words_list] '
         '[--urls]')

# Options triggered by --all, in the order they are run.
ALL_OPTIONS = ['--authors_list', '--authors_count', '--tweets_by_day',
               '--tweets_by_hour', '--hashtag_count', '--hashtag_list',
               '--words_count', '--words_list', '--urls']

_PY2 = sys.version_info[0] < 3


def _read_text(path):
    """Return the whole content of *path* with line endings read as '\\n'."""
    if _PY2:
        # Python 2: keep working on byte strings, as the script always did.
        with open(path, 'rU') as handle:
            return handle.read()
    # Python 3: text mode already applies universal newlines; undecodable
    # bytes are replaced so that a stray character cannot abort the run.
    with io.open(path, 'r', encoding='utf-8', errors='replace') as handle:
        return handle.read()


def _open_output(path):
    """Open *path* for writing text; meant to be used in a with statement."""
    if _PY2:
        return open(path, 'w')
    return io.open(path, 'w', encoding='utf-8')


def _count(items):
    """Return a dict mapping every distinct item to its number of occurrences."""
    ranking = {}
    for item in items:
        ranking[item] = ranking.get(item, 0) + 1
    return ranking


def _write_ranking(file_out, header, ranking):
    """Write *header* and one 'key<TAB>value' line per key, sorted by key."""
    with _open_output(file_out) as f_out:
        f_out.write(header)
        for key in sorted(ranking):
            f_out.write('%s\t%s\n' % (key, ranking[key]))


def _load_filter_words(file_filter):
    """Return the set of lower-cased words found in the filter file.

    An empty file name means that no filter was requested and yields an empty
    set silently; a file that cannot be read is reported and also yields an
    empty set.
    """
    if not file_filter:
        return set()
    try:
        text = _read_text(file_filter)
    except IOError:
        print("Filter words file doesn't exist")
        return set()
    # Words are lower-cased before being compared, so filters must be too.
    return set(word.lower() for word in re.findall(r'\w+', text))


def _find_words(text):
    """Return the words of *text* (two or more letters between whitespace).

    Lookarounds are used so that the whitespace between two adjacent words is
    not consumed by the first match, which would hide the second word.
    """
    return re.findall(r'(?<=\s)([a-zA-Z][a-zA-Z]+)(?=\s)', text)


class thread_get_url(threading.Thread):
    """Worker thread that opens one url and records where it finally leads.

    On success the final url is stored in the global urls_large under the
    original url. On failure the occurrences of the url (taken from the
    global urls_rank) are added to the global urls_errors.
    """

    def __init__(self, url):
        threading.Thread.__init__(self)
        self.url = url
        self.url_large = 'none'
        self.error = False
        self.info = []

    def run(self):
        global urls_errors
        self.error = False
        try:
            # get file-like object for url
            ufile = urllib2.urlopen(self.url, timeout=URL_TIMEOUT)
        except Exception:
            # Any failure (bad url, HTTP error, timeout...) marks the url as
            # wrong; the worker must never die with a traceback.
            print('URLError:\t%s' % (self.url))
            with _urls_lock:
                # a url unknown to urls_rank counts as a single occurrence
                urls_errors = urls_errors + urls_rank.get(self.url, 1)
            self.error = True
            return
        try:
            self.info = ufile.info()        # meta-info about the url content
            self.url_large = ufile.geturl()  # url after following redirections
        finally:
            ufile.close()
        with _urls_lock:
            urls_large[self.url] = self.url_large


def get_urls(file_in, prefix_file_out):
    """Extract the urls of the tweets, expand them and write the rankings.

    file_in: name of the file with the tweets.
    prefix_file_out: prefix of the five output files
        (_urls_translates.csv, _shorten_urls.csv, _large_urls.csv,
        _sites.csv and _shorten_sites.csv).

    The urls are opened over the network, URL_BATCH_SIZE at a time. The
    results are accumulated in the globals urls_rank, urls_large and
    urls_errors, and some statistics are printed at the end.
    """
    text = _read_text(file_in)
    sites_pages = re.findall(
        r"\s(https?://[\w\.-]+)([\w\/\&\%\$\-_\?\=\@\+\.;'!]+)", text)

    # Extract and count unique urls
    list_urls = []
    urls_count = 0
    for site, page in sites_pages:
        url = site + page
        urls_count = urls_count + 1
        if url not in urls_rank:
            urls_rank[url] = 1
            list_urls.append(url)
        else:
            urls_rank[url] = urls_rank[url] + 1
    urls_count_unique = len(list_urls)

    # Expand unique urls in parallel to shorten time; every batch is joined
    # before the next one starts so the number of live threads stays bounded.
    for start in range(0, len(list_urls), URL_BATCH_SIZE):
        workers = [thread_get_url(url)
                   for url in list_urls[start:start + URL_BATCH_SIZE]]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    # Find out if urls are short or large
    urls_short = {}
    sites_rank = {}
    shorten_sites_rank = {}
    urls_short_count = 0
    urls_large_count = 0
    site_pattern = re.compile(r"http[s]*://[\w\.-]+")
    for url in sorted(urls_large):
        url_large = urls_large[url]
        repeat = urls_rank[url]
        # The same url, or the same url plus one trailing character (e.g. a
        # final "/"), is not considered a shortened url.
        same = url == url_large
        one_more = len(url) + 1 == len(url_large) and url == url_large[:-1]
        if same or one_more:
            urls_large_count = urls_large_count + repeat
        else:
            urls_short[url] = url
            urls_short_count = urls_short_count + repeat
            # extract shorten site
            site = site_pattern.search(url)
            if site:
                key = site.group()
                shorten_sites_rank[key] = shorten_sites_rank.get(key, 0) + repeat
        # extract site
        site = site_pattern.search(url_large)
        if site:
            key = site.group()
            sites_rank[key] = sites_rank.get(key, 0) + repeat

    # Write results
    # Translated urls
    with _open_output(prefix_file_out + '_urls_translates.csv') as f_out:
        f_out.write("Url\tUrl Translate\tRepeat\n")
        for url in sorted(urls_large):
            f_out.write('%s\t%s\t%s\n' % (url, urls_large[url], urls_rank[url]))
    # Shorten urls
    with _open_output(prefix_file_out + '_shorten_urls.csv') as f_out:
        f_out.write("Shorten url\tRepeat\n")
        for url in sorted(urls_short):
            f_out.write('%s\t%s\n' % (url, urls_rank[url]))
    # Large urls ranking
    with _open_output(prefix_file_out + '_large_urls.csv') as f_out:
        f_out.write("Large url\tRepeat\n")
        for url in sorted(urls_large):
            f_out.write('%s\t%s\n' % (urls_large[url], urls_rank[url]))
    # Sites ranking
    _write_ranking(prefix_file_out + '_sites.csv', "Site\tRepeat\n", sites_rank)
    # Shorten sites ranking
    _write_ranking(prefix_file_out + '_shorten_sites.csv',
                   "Shorten site \tRepeat\n", shorten_sites_rank)

    print("Statistics")
    print('number of Urls \t%s' % (urls_count))
    print('number unique urls \t%s' % (urls_count_unique))
    print('number of short urls\t%s' % (urls_short_count))
    print('number of large urls\t%s' % (urls_large_count))
    print('number of wrong urls\t%s' % (urls_errors))
    return


def list_authors(file_in, file_out):
    """Write to file_out, one per line, the authors in order of appearance.

    An author is the "@name" that follows " / " in the tweets of file_in.
    """
    authors = re.findall(r'\s/\s(@\w+)', _read_text(file_in))
    with _open_output(file_out) as f_out:
        for author in authors:
            f_out.write('%s\n' % (author))
    return


def count_authors(file_in, file_out):
    """Write to file_out every author of file_in with its number of tweets."""
    authors = re.findall(r'\s/\s(@\w+)', _read_text(file_in))
    _write_ranking(file_out, "Authors\tRepeat\n", _count(authors))
    return


def extract_days_hours(file_in, days_hours=None):
    """Return the list of (day, hour) pairs of the timestamps of file_in.

    A timestamp is a line with the form "YYYY-MM-DD HH:MM:SS"; the line
    anchors make the timestamp of the very first line count too.
    days_hours is ignored; it is kept only for backward compatibility.
    """
    text = _read_text(file_in)
    return re.findall(r'^(\d\d\d\d-\d\d-\d\d)\s(\d\d):\d\d:\d\d$', text,
                      re.MULTILINE)


def count_tweets_day(file_in, file_out):
    """Write to file_out the number of tweets of file_in for each day."""
    days = [day for (day, hour) in extract_days_hours(file_in)]
    # The header follows the order of the data columns: day, then count.
    _write_ranking(file_out, "Day\tN. Tweets\n", _count(days))
    return


def count_tweets_hour(file_in, file_out):
    """Write to file_out the number of tweets of file_in for each hour."""
    hours = [hour for (day, hour) in extract_days_hours(file_in)]
    # The header follows the order of the data columns: hour, then count.
    _write_ranking(file_out, "Hour\tN. Tweets\n", _count(hours))
    return


def count_hashtag(file_in, file_out):
    """Write to file_out every lower-cased hashtag with its occurrences."""
    hashtags = re.findall(r'#\w+', _read_text(file_in))
    ranking = _count(hashtag.lower() for hashtag in hashtags)
    _write_ranking(file_out, "Hashtag\tRepeat\n", ranking)
    return


def list_hashtag(file_in, file_out):
    """Write to file_out the lower-cased hashtags in order of appearance.

    The hashtags are written on a single line, each one followed by a space.
    """
    hashtags = re.findall(r'#\w+', _read_text(file_in))
    with _open_output(file_out) as f_out:
        for hashtag in hashtags:
            f_out.write('%s ' % (hashtag.lower()))
    return


def count_words(file_in, file_out, file_filter):
    """Write to file_out every lower-cased word with its occurrences.

    file_filter: name of a file with words to leave out; an empty name means
        no filter.
    """
    filters = _load_filter_words(file_filter)
    words = [word.lower() for word in _find_words(_read_text(file_in))]
    ranking = _count(word for word in words if word not in filters)
    _write_ranking(file_out, "Word\tRepeat\n", ranking)
    return


def list_words(file_in, file_out, file_filter):
    """Write to file_out the lower-cased words in order of appearance.

    The words are written on a single line, each one followed by a space.
    file_filter: name of a file with words to leave out; an empty name means
        no filter.
    """
    filters = _load_filter_words(file_filter)
    words = _find_words(_read_text(file_in))
    with _open_output(file_out) as f_out:
        for word in words:
            word = word.lower()
            if word not in filters:
                f_out.write('%s ' % (word))
    return


def main():
    """Parse the command line and run the requested extractions.

    The process always ends through sys.exit(): 0 when the options were run,
    1 when the command line is not valid.
    """
    filter_words = ""
    args = sys.argv[1:]
    if not args:
        print(USAGE)
        sys.exit(1)
    file_in = args.pop(0)
    # Only the base name is used, so a dot in a directory name cannot be
    # mistaken for the file name.
    filename = re.search(r"([\w-]+)\.([\w]*)", os.path.basename(file_in))
    if not filename:
        print("bad filename %s" % (file_in))
        sys.exit(1)
    name = filename.group(0)
    if not args:
        print("there insn't any option")
        sys.exit(1)

    # First pass: collect the options, so that --filter applies to the words
    # options wherever it appears in the command line.
    options = []
    while args:
        arg = args.pop(0)
        if arg == '--filter':
            if not args:
                print(USAGE)
                sys.exit(1)
            filter_words = args.pop(0)
            print('-->Filter words file:  %s' % (filter_words))
        elif arg == '--all':
            options.extend(ALL_OPTIONS)
        else:
            options.append(arg)

    # option -> (suffix of the output file, progress message, function)
    plain_actions = {
        '--authors_list': ('_authors_list', '..extracting authors list',
                           list_authors),
        '--authors_count': ('_authors_count', '..extracting uthors_Count',
                            count_authors),
        '--tweets_by_day': ('_tweets_by_day', '..extracting tweets by day',
                            count_tweets_day),
        '--tweets_by_hour': ('_tweets_by_hour', '..extracting tweets by hour',
                             count_tweets_hour),
        '--hashtag_count': ('_hashtag_count', '..extracting hashtag count',
                            count_hashtag),
        '--hashtag_list': ('_hashtag_list', '..extracting hashtag list',
                           list_hashtag),
    }
    # same layout, for the functions that also take the filter words file
    filtered_actions = {
        '--words_count': ('_words_count', '..extracting words count',
                          count_words),
        '--words_list': ('_words_list', '..extracting words list',
                         list_words),
    }

    # Second pass: run the options in the order given.
    for option in options:
        if option in plain_actions:
            suffix, message, action = plain_actions[option]
            print(message)
            action(file_in, name + suffix)
        elif option in filtered_actions:
            suffix, message, action = filtered_actions[option]
            print(message)
            action(file_in, name + suffix, filter_words)
        elif option == '--urls':
            print('..extracting urls, it could take some time')
            get_urls(file_in, name)
        else:
            print('unknown option ignored: %s' % (option))
    sys.exit(0)


if __name__ == '__main__':
    main()