Last active
August 29, 2015 14:13
-
-
Save rajat404/51fb9f82621a5825f3c3 to your computer and use it in GitHub Desktop.
dedup project-phase2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<center><h1><u><b>Textual Analysis for Detection & Removal of Duplicates</b></u></h1></center>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"slideshow": { | |
"slide_type": "-" | |
} | |
}, | |
"source": [ | |
"<h2><center>About</center></h2>\n", | |
"<li>The aim of this project is to find and remove duplicates or near-duplicates from text\n", | |
"<li>Here we are taking the specific case of tweets (from Twitter)\n", | |
"<li>This project aims to reduce the amount of redundant data we see across the internet, primarily to conserve time\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<h2>Authentication</h2>\n", | |
"We shall use the access token and API secrets in the file keys.txt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import json\n", | |
"import twitter\n", | |
"#import urllib2\n", | |
"import requests\n", | |
"import itertools\n", | |
"import re\n", | |
"from time import time\n", | |
"from pprint import pprint\n", | |
"from hr import hr" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"authval = json.load(open(\"keys.txt\"))\n", | |
"CONSUMER_KEY = authval['CONSUMER_KEY']\n", | |
"CONSUMER_SECRET = authval['CONSUMER_SECRET']\n", | |
"OAUTH_TOKEN = authval['OAUTH_TOKEN'] \n", | |
"OAUTH_TOKEN_SECRET = authval['OAUTH_TOKEN_SECRET']\n", | |
"\n", | |
"auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,\n", | |
" CONSUMER_KEY, CONSUMER_SECRET)\n", | |
"\n", | |
"t = twitter.Twitter(auth=auth)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Function to check the credentials of the User\n", | |
"def verify():\n", | |
" verificationDetails = t.account.verify_credentials()\n", | |
" print \"Name: \", verificationDetails['name']\n", | |
" print \"Screen Name: \", verificationDetails['screen_name']\n", | |
" \n", | |
"verify()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Name: Rajat Goyal\n", | |
"Screen Name: rajat404\n" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"testTweet = t.statuses.home_timeline()[0]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"t1 = time()\n", | |
"completeTimeline = t.statuses.home_timeline(count=10)\n", | |
"#twiterator = itertools.chain.from_iterable(itertools.repeat(completeTimeline))\n", | |
"t2 = time()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#just displaying the first tweet\n", | |
"completeTimeline[0]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": [ | |
"{u'contributors': None,\n", | |
" u'coordinates': None,\n", | |
" u'created_at': u'Wed Jan 14 06:10:17 +0000 2015',\n", | |
" u'entities': {u'hashtags': [],\n", | |
" u'symbols': [],\n", | |
" u'urls': [],\n", | |
" u'user_mentions': [{u'id': 72292440,\n", | |
" u'id_str': u'72292440',\n", | |
" u'indices': [0, 11],\n", | |
" u'name': u'Jaseem Abid',\n", | |
" u'screen_name': u'jaseemabid'},\n", | |
" {u'id': 60310843,\n", | |
" u'id_str': u'60310843',\n", | |
" u'indices': [47, 61],\n", | |
" u'name': u'Noufal Ibrahim',\n", | |
" u'screen_name': u'noufalibrahim'}]},\n", | |
" u'favorite_count': 0,\n", | |
" u'favorited': False,\n", | |
" u'geo': None,\n", | |
" u'id': 555245496944443392,\n", | |
" u'id_str': u'555245496944443392',\n", | |
" u'in_reply_to_screen_name': u'jaseemabid',\n", | |
" u'in_reply_to_status_id': 555243193776603136,\n", | |
" u'in_reply_to_status_id_str': u'555243193776603136',\n", | |
" u'in_reply_to_user_id': 72292440,\n", | |
" u'in_reply_to_user_id_str': u'72292440',\n", | |
" u'lang': u'en',\n", | |
" u'place': None,\n", | |
" u'retweet_count': 0,\n", | |
" u'retweeted': False,\n", | |
" u'source': u'<a href=\"https://about.twitter.com/products/tweetdeck\" rel=\"nofollow\">TweetDeck</a>',\n", | |
" u'text': u'@jaseemabid get a hockey mask and be creative. @noufalibrahim',\n", | |
" u'truncated': False,\n", | |
" u'user': {u'contributors_enabled': False,\n", | |
" u'created_at': u'Thu Apr 02 15:45:06 +0000 2009',\n", | |
" u'default_profile': False,\n", | |
" u'default_profile_image': False,\n", | |
" u'description': u'All Things Python | Data crunching n00b | Developer at @aksharadotorg and @KLPSays | Online at #kolkata Freenode',\n", | |
" u'entities': {u'description': {u'urls': []},\n", | |
" u'url': {u'urls': [{u'display_url': u'bibhas.in',\n", | |
" u'expanded_url': u'https://bibhas.in',\n", | |
" u'indices': [0, 23],\n", | |
" u'url': u'https://t.co/vRF6vHxadK'}]}},\n", | |
" u'favourites_count': 371,\n", | |
" u'follow_request_sent': False,\n", | |
" u'followers_count': 864,\n", | |
" u'following': True,\n", | |
" u'friends_count': 381,\n", | |
" u'geo_enabled': False,\n", | |
" u'id': 28360450,\n", | |
" u'id_str': u'28360450',\n", | |
" u'is_translation_enabled': False,\n", | |
" u'is_translator': False,\n", | |
" u'lang': u'en',\n", | |
" u'listed_count': 71,\n", | |
" u'location': u'India',\n", | |
" u'name': u'Bibhas Debnath',\n", | |
" u'notifications': False,\n", | |
" u'profile_background_color': u'171717',\n", | |
" u'profile_background_image_url': u'http://pbs.twimg.com/profile_background_images/728885297/300ef84310d96c3170c00e8a0f7baa7c.jpeg',\n", | |
" u'profile_background_image_url_https': u'https://pbs.twimg.com/profile_background_images/728885297/300ef84310d96c3170c00e8a0f7baa7c.jpeg',\n", | |
" u'profile_background_tile': True,\n", | |
" u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/28360450/1413277471',\n", | |
" u'profile_image_url': u'http://pbs.twimg.com/profile_images/447833391115677696/WdBHBpjv_normal.jpeg',\n", | |
" u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/447833391115677696/WdBHBpjv_normal.jpeg',\n", | |
" u'profile_link_color': u'737475',\n", | |
" u'profile_location': None,\n", | |
" u'profile_sidebar_border_color': u'FFFFFF',\n", | |
" u'profile_sidebar_fill_color': u'CCCACC',\n", | |
" u'profile_text_color': u'030303',\n", | |
" u'profile_use_background_image': True,\n", | |
" u'protected': False,\n", | |
" u'screen_name': u'bibhasdn',\n", | |
" u'statuses_count': 33289,\n", | |
" u'time_zone': u'New Delhi',\n", | |
" u'url': u'https://t.co/vRF6vHxadK',\n", | |
" u'utc_offset': 19800,\n", | |
" u'verified': False}}" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print \"Time taken to load tweets: \", t2-t1" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Time taken to load tweets: 52.4830749035\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#print str(completeTimeline[0]['entities']['urls'][0]['expanded_url'])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<h2>Sanitization</h2>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Tweets contain a huge amount of metadata. We need to extract the useful components." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from string import punctuation\n", | |
"set_punct = set(punctuation)\n", | |
"set_punct = set_punct - {\"_\", \"@\"}\n", | |
"\n", | |
"def sanitize(text, set_excludes):\n", | |
" \"\"\"\n", | |
" Return a `sanitized` version of the string `text`.\n", | |
" \"\"\"\n", | |
" text = text.lower()\n", | |
" text = \" \".join([ w for w in text.split() if not (\"http://\" in w) ])\n", | |
" letters_noPunct = [ (\" \" if c in set_excludes else c) for c in text ]\n", | |
" text = \"\".join(letters_noPunct)\n", | |
" words = text.split()\n", | |
" long_enuf_words = [w.strip() for w in words if len(w)>1]\n", | |
" return \" \".join(long_enuf_words)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print set_punct #characters that will be removed from the tweets" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"set(['!', '#', '\"', '%', '$', \"'\", '&', ')', '(', '+', '*', '-', ',', '/', '.', ';', ':', '=', '<', '?', '>', '[', ']', '\\\\', '^', '`', '{', '}', '|', '~'])\n" | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def Refine(raw_tweet):\n", | |
" simple = {}\n", | |
" simple['text'] = raw_tweet['text']\n", | |
" simple['cleanText'] = sanitize(raw_tweet['text'], set_punct)\n", | |
" simple['id'] = raw_tweet['id']\n", | |
" simple['user_screen_name'] = raw_tweet['user']['screen_name']\n", | |
" simple['urls'] = raw_tweet['entities']['urls']\n", | |
" req = None\n", | |
" try:\n", | |
" temp = (requests.get(str(raw_tweet['entities']['urls'][0]['expanded_url'])).url)\n", | |
" simple['cleanUrl'] = temp\n", | |
" except:\n", | |
" #print raw_tweet['entities']['urls']\n", | |
" simple['cleanUrl'] = None\n", | |
" return simple" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"completeTimeline[1]['entities'].keys()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 19, | |
"text": [ | |
"[u'symbols', u'user_mentions', u'hashtags', u'urls']" | |
] | |
} | |
], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"len(completeTimeline)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 24, | |
"text": [ | |
"96" | |
] | |
} | |
], | |
"prompt_number": 24 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"completeTimeline[1]['entities']['urls']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 20, | |
"text": [ | |
"[{u'display_url': u'bit.ly/1xX8IB8',\n", | |
" u'expanded_url': u'http://bit.ly/1xX8IB8',\n", | |
" u'indices': [30, 52],\n", | |
" u'url': u'http://t.co/KebevCzDrZ'}]" | |
] | |
} | |
], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"refinedTweet = []\n", | |
"t1 = time()\n", | |
"for tweet in completeTimeline:\n", | |
" refinedTweet.append(Refine(tweet))\n", | |
"\n", | |
"t2 = time()\n", | |
"print \"Time taken to load tweets: \", t2-t1\n", | |
"#print json.dumps(refinedTweet, sort_keys=True, indent=2)\n", | |
"#data = json.dumps(refinedTweet)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[]\n", | |
"[]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"[]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"[]\n", | |
"[]\n", | |
"[]\n", | |
"[]\n", | |
"[]\n", | |
"[]\n", | |
"[]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"[]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"[]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"[]\n", | |
"[]\n", | |
"[]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"[]\n", | |
"[]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"[]" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Time taken to load tweets: " | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" 214.316688061\n" | |
] | |
} | |
], | |
"prompt_number": 164 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"len(refinedTweet)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 167, | |
"text": [ | |
"96" | |
] | |
} | |
], | |
"prompt_number": 167 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#fake push/pop\n", | |
"#refinedTweet.append({'cleanUrl': u'http://www.smashingmagazine.com/2015/01/07/designing-for-print-with-css/', 'cleanText': u'painting for print with html great walkthrough on using html to format books', 'text': u'designing for print with CSS: http://t.co/KebevCzDrZ - great walkthrough on using CSS to format books.', 'user_screen_name': u'igrigorik', 'urls': [{u'url': u'http://t.co/KebevCzDrZ', u'indices': [30, 52], u'expanded_url': u'http://bit.ly/1xX8IB8', u'display_url': u'bit.ly/1xX8IB8'}], 'id': 555151872663617537})\n", | |
"#refinedTweet.pop()\n", | |
"\n", | |
"#refinedTweet[-1]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 168 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#collections of only the 'clean' text of the tweets\n", | |
"documents = []\n", | |
"for item in refinedTweet:\n", | |
" documents.append(item['cleanText'])" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 169 | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"mapping = []\n", | |
"for i,item in enumerate(refinedTweet):\n", | |
" mapping.extend([(i,item['cleanText'])])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"hr()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"####################################################################################################################################################################\n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<h3>Jaccard Similarity</h3>" | |
] | |
}, | |
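{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"To find near-duplicate tweets, each tweet is split into K-word shingles and every pair of tweets is scored by the Jaccard similarity of their shingle sets (size of the intersection divided by size of the union)." | |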
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# a shingle is a string with K-words\n", | |
"K = 4\n", | |
"\n", | |
"def jaccard_set(s1, s2):\n", | |
" u = s1.union(s2)\n", | |
" i = s1.intersection(s2)\n", | |
" return float(len(i))/float(len(u))\n", | |
"\n", | |
"def make_a_set_of_tokens(doc):\n", | |
"\n", | |
" # replace non-alphanumeric char with a space, and then split\n", | |
" tokens = re.sub(\"[^\\w]\", \" \", doc).split()\n", | |
"\n", | |
" sh = set()\n", | |
" for i in range(len(tokens)-K):\n", | |
" t = tokens[i]\n", | |
" for x in tokens[i+1:i+K]:\n", | |
" t += ' ' + x \n", | |
" sh.add(t)\n", | |
" return sh\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 171 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"shingles = []\n", | |
"for doc in documents:\n", | |
" sh = make_a_set_of_tokens(doc)\n", | |
" shingles.append(sh)\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 172 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"combinations = list( itertools.combinations([x for x in range(len(shingles))], 2) )\n", | |
"#print(\"combinations=%s\") %(combinations)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 173 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# compare each pair in combinations tuple of shingles\n", | |
"dupList = []\n", | |
"for c in combinations:\n", | |
" i1 = c[0]\n", | |
" i2 = c[1]\n", | |
" jac = jaccard_set(shingles[i1], shingles[i2])\n", | |
" if jac > 0.0:\n", | |
" #print(\"%s : %s,%s : jaccard=%s\") %(c, shingles[i1],shingles[i2],jac)\n", | |
" dupList.append(c)\n", | |
" print(\"%s : jaccard=%s\") %(c,jac)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"(9, 36) : jaccard=0.0625\n", | |
"(35, 42) : jaccard=0.0769230769231\n", | |
"(44, 75) : jaccard=0.027027027027\n", | |
"(44, 82) : jaccard=0.03125\n", | |
"(45, 61) : jaccard=0.0434782608696\n", | |
"(75, 82) : jaccard=0.0357142857143\n", | |
"(88, 95) : jaccard=0.0625\n" | |
] | |
} | |
], | |
"prompt_number": 174 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#list of the duplicate pairs\n", | |
"dupList" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 175, | |
"text": [ | |
"[(9, 36), (35, 42), (44, 75), (44, 82), (45, 61), (75, 82), (88, 95)]" | |
] | |
} | |
], | |
"prompt_number": 175 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for i in dupList:\n", | |
" print i, '\\n--------'\n", | |
" print \"Original Tweet: \", refinedTweet[i[0]]['text'] ,'\\n\\nURL:', refinedTweet[i[0]]['cleanUrl'] ,'\\n\\nPosted By:',\\\n", | |
" refinedTweet[i[0]]['user_screen_name']\n", | |
" hr('-')\n", | |
" print \"Original Tweet: \", refinedTweet[i[1]]['text'] ,'\\n\\nURL:', refinedTweet[i[1]]['cleanUrl'] ,'\\n\\nPosted By:',\\\n", | |
" refinedTweet[i[1]]['user_screen_name']\n", | |
" hr()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"(9, 36) \n", | |
"--------\n", | |
"Original Tweet: Amazon signs Woody Allen for his first TV series, to run on Prime Instant Video http://t.co/bfDAzczvUy http://t.co/Nyqt076Rj1 \n", | |
"\n", | |
"URL: http://techcrunch.com/2015/01/13/amazon-signs-woody-allen-for-his-first-tv-series-to-run-on-prime-instant-video/?ncid=rss&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+Techcrunch+%28TechCrunch%29 \n", | |
"\n", | |
"Posted By: TechCrunch\n", | |
"----------------------------------------------------------------------------------\n", | |
"Original Tweet: Amazon signs Woody Allen to create his first TV show http://t.co/cu95uBpDmW http://t.co/W2rbbOVdjC \n", | |
"\n", | |
"URL: http://thenextweb.com/media/2015/01/13/amazon-signs-woody-allen-create-first-tv-show/ \n", | |
"\n", | |
"Posted By: TheNextWeb\n", | |
"##################################################################################\n", | |
"(35, 42) \n", | |
"--------\n", | |
"Original Tweet: TechCrunch Radio is now live! Tune in on Sirius XM 102 Indie #TCRadio http://t.co/P6kJw0WQnq \n", | |
"\n", | |
"URL: http://techcrunch.com/2015/01/13/tune-in-to-techcrunch-radio-tonight-on-sirius-xm-102-indie/?ncid=rss&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+Techcrunch+%28TechCrunch%29 \n", | |
"\n", | |
"Posted By: TechCrunch\n", | |
"----------------------------------------------------------------------------------\n", | |
"Original Tweet: TechCrunch Radio is about to go live on the air! Tune in at 6pm ET, 3pm PT on Sirius XM 102 Indie #TCRadio http://t.co/IATc8rpTHL \n", | |
"\n", | |
"URL: http://techcrunch.com/2015/01/13/tune-in-to-techcrunch-radio-tonight-on-sirius-xm-102-indie/?ncid=rss&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+Techcrunch+%28TechCrunch%29 \n", | |
"\n", | |
"Posted By: TechCrunch\n", | |
"##################################################################################\n", | |
"(44, 75) \n", | |
"--------\n", | |
"Original Tweet: RT @rjcc: Elon Musk: even if all energy for EVs was created by hydrocarbons it would be better for the environment than gas. inherently jus\u2026 \n", | |
"\n", | |
"URL: None \n", | |
"\n", | |
"Posted By: engadget\n", | |
"----------------------------------------------------------------------------------\n", | |
"Original Tweet: RT @rjcc: Elon Musk: I have an issue with punctuality (in response to q about Model X delay) says Model X will ship this summer \n", | |
"\n", | |
"URL: None \n", | |
"\n", | |
"Posted By: engadget\n", | |
"##################################################################################\n", | |
"(44, 82) \n", | |
"--------\n", | |
"Original Tweet: RT @rjcc: Elon Musk: even if all energy for EVs was created by hydrocarbons it would be better for the environment than gas. inherently jus\u2026 \n", | |
"\n", | |
"URL: None \n", | |
"\n", | |
"Posted By: engadget\n", | |
"----------------------------------------------------------------------------------\n", | |
"Original Tweet: RT @rjcc: Elon Musk actually emits a particular frequency of light that renders him immune to photography http://t.co/dzEpDxSnmT \n", | |
"\n", | |
"URL: None \n", | |
"\n", | |
"Posted By: engadget\n", | |
"##################################################################################\n", | |
"(45, 61) \n", | |
"--------\n", | |
"Original Tweet: RT @rjcc: Musk: Our challenge is not selling the Model X, it's producing it reliably, etc. No ads this year \n", | |
"\n", | |
"URL: None \n", | |
"\n", | |
"Posted By: engadget\n", | |
"----------------------------------------------------------------------------------\n", | |
"Original Tweet: RT @rjcc: Musk: Our biggest focus is adding service centers (instead of stores) http://t.co/hZE9obko5m \n", | |
"\n", | |
"URL: None \n", | |
"\n", | |
"Posted By: engadget\n", | |
"##################################################################################\n", | |
"(75, 82) \n", | |
"--------\n", | |
"Original Tweet: RT @rjcc: Elon Musk: I have an issue with punctuality (in response to q about Model X delay) says Model X will ship this summer \n", | |
"\n", | |
"URL: None \n", | |
"\n", | |
"Posted By: engadget\n", | |
"----------------------------------------------------------------------------------\n", | |
"Original Tweet: RT @rjcc: Elon Musk actually emits a particular frequency of light that renders him immune to photography http://t.co/dzEpDxSnmT \n", | |
"\n", | |
"URL: None \n", | |
"\n", | |
"Posted By: engadget\n", | |
"##################################################################################\n", | |
"(88, 95) \n", | |
"--------\n", | |
"Original Tweet: Follow us live with Elon Musk on the future of electric cars: http://t.co/v0ahYWNQ2v http://t.co/K1u8IF0Ki3 \n", | |
"\n", | |
"URL: http://live.theverge.com/elon-musk-tesla-detroit-auto-show-2015/ \n", | |
"\n", | |
"Posted By: verge\n", | |
"----------------------------------------------------------------------------------\n", | |
"Original Tweet: Stephen Hawking, Elon Musk sign open letter on the future of artificial intelligence http://t.co/q5rjgk1zeZ \n", | |
"\n", | |
"URL: http://www.fastcompany.com/3040853/scientists-caution-about-the-dangers-of-artificial-intelligence \n", | |
"\n", | |
"Posted By: FastCompany\n", | |
"##################################################################################\n" | |
] | |
} | |
], | |
"prompt_number": 176 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<hr><center><h3>The way ahead...</h3></center>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"<ul>\n", | |
"<li>The accuracy needs to be increased \n", | |
"<li>Optimize the extraction process to speed things up\n", | |
"<li>Implement minhashing, i.e., calculate the hashes of all ngrams and then compare them to save time\n", | |
"<li>Add a redis pipeline to cache the hashes\n", | |
"<li>Explore new techniques to find the near duplicate text\n", | |
"<li>Add synonyms of words to facililate duplicate detection of different seeming words\n", | |
"<li>Add frontend, for easy navigation\n", | |
"<li>Add option to view all links of similar tweets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment