Created
March 13, 2022 05:27
-
-
Save thisismattmiller/9bde51b98983bad0d4d738eed0a4d82f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import json | |
urls = {} | |
for file in glob.glob('data_sogb/*'): | |
with open(file) as inf: | |
for line in inf: | |
j = json.loads('{' + line.split('{')[1]) | |
url = j['url'].replace('http://','').replace('https://','').split('/')[0] | |
urls[url] = True | |
print(json.dumps(list(urls.keys()),indent=2)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import requests
# Common Crawl crawl identifiers to query, listed newest first.
common_crawls = [
    "CC-MAIN-2022-05", "CC-MAIN-2021-49", "CC-MAIN-2021-43", "CC-MAIN-2021-39",
    "CC-MAIN-2021-31", "CC-MAIN-2021-25", "CC-MAIN-2021-21", "CC-MAIN-2021-17",
    "CC-MAIN-2021-10", "CC-MAIN-2021-04", "CC-MAIN-2020-50", "CC-MAIN-2020-45",
    "CC-MAIN-2020-40", "CC-MAIN-2020-34", "CC-MAIN-2020-29", "CC-MAIN-2020-24",
    "CC-MAIN-2020-16", "CC-MAIN-2020-10", "CC-MAIN-2020-05", "CC-MAIN-2019-51",
    "CC-MAIN-2019-47", "CC-MAIN-2019-43", "CC-MAIN-2019-39", "CC-MAIN-2019-35",
    "CC-MAIN-2019-30", "CC-MAIN-2019-26", "CC-MAIN-2019-22", "CC-MAIN-2019-18",
    "CC-MAIN-2019-13", "CC-MAIN-2019-09", "CC-MAIN-2019-04", "CC-MAIN-2018-51",
    "CC-MAIN-2018-47", "CC-MAIN-2018-43", "CC-MAIN-2018-39", "CC-MAIN-2018-34",
    "CC-MAIN-2018-30", "CC-MAIN-2018-26", "CC-MAIN-2018-22", "CC-MAIN-2018-17",
    "CC-MAIN-2018-13", "CC-MAIN-2018-09", "CC-MAIN-2018-05", "CC-MAIN-2017-51",
    "CC-MAIN-2017-47", "CC-MAIN-2017-43", "CC-MAIN-2017-39", "CC-MAIN-2017-34",
    "CC-MAIN-2017-30", "CC-MAIN-2017-26", "CC-MAIN-2017-22", "CC-MAIN-2017-17",
    "CC-MAIN-2017-13", "CC-MAIN-2017-09", "CC-MAIN-2017-04", "CC-MAIN-2016-50",
    "CC-MAIN-2016-44", "CC-MAIN-2016-40", "CC-MAIN-2016-36", "CC-MAIN-2016-30",
    "CC-MAIN-2016-26", "CC-MAIN-2016-22", "CC-MAIN-2016-18", "CC-MAIN-2016-07",
    "CC-MAIN-2015-48", "CC-MAIN-2015-40", "CC-MAIN-2015-35", "CC-MAIN-2015-32",
    "CC-MAIN-2015-27", "CC-MAIN-2015-22", "CC-MAIN-2015-18", "CC-MAIN-2015-14",
    "CC-MAIN-2015-11", "CC-MAIN-2015-06", "CC-MAIN-2014-52", "CC-MAIN-2014-49",
    "CC-MAIN-2014-42", "CC-MAIN-2014-41", "CC-MAIN-2014-35", "CC-MAIN-2014-23",
    "CC-MAIN-2014-15", "CC-MAIN-2014-10", "CC-MAIN-2013-48", "CC-MAIN-2013-20",
]
# Seconds to wait on the index server before giving up on a single crawl;
# without a timeout, requests.get() can block indefinitely.
REQUEST_TIMEOUT = 60

# open(..., 'w') does not create missing directories, so make sure the
# output folder exists before the first successful response is written.
os.makedirs("data_sogb", exist_ok=True)

# Query each crawl's index for every captured *.signourguestbook.com URL and
# save the raw response body to data_sogb/<crawl id>.
for cc in common_crawls:
    url = f"http://index.commoncrawl.org/{cc}-index?url=*.signourguestbook.com"
    print(url)
    try:
        req = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # One unreachable or slow index should not abort the remaining crawls.
        print(f"request failed for {cc}: {err}")
        continue
    # Only successful responses are saved; any other status is skipped
    # (presumably the index answers non-200 when it has no captures -- confirm).
    if req.status_code == 200:
        with open(f"data_sogb/{cc}", 'w') as outf:
            outf.write(req.text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ | |
"1cda.signourguestbook.com", | |
"31act6668.signourguestbook.com", | |
"78thinfantrydiv.signourguestbook.com", | |
"abbeyjim.signourguestbook.com", | |
"abrev.signourguestbook.com", | |
"alrdoc.signourguestbook.com", | |
"angelwispa.signourguestbook.com", | |
"arrowoodbrian.signourguestbook.com", | |
"august.signourguestbook.com", | |
"authorcelia.signourguestbook.com", | |
"authoritarians.signourguestbook.com", | |
"authorkeqm.signourguestbook.com", | |
"avhsalumni.signourguestbook.com", | |
"bravedork.signourguestbook.com", | |
"bronze4u.signourguestbook.com", | |
"browngap.signourguestbook.com", | |
"bryanel.signourguestbook.com", | |
"catwholaughed.signourguestbook.com", | |
"chocal8kiss.signourguestbook.com", | |
"chrishill.signourguestbook.com", | |
"cigarmanandy.signourguestbook.com", | |
"cottonbalers.signourguestbook.com", | |
"cromer.signourguestbook.com", | |
"cubit99.signourguestbook.com", | |
"cybersulat.signourguestbook.com", | |
"deadmiledance.signourguestbook.com", | |
"deg.signourguestbook.com", | |
"dejavu48.signourguestbook.com", | |
"deyakusuma.signourguestbook.com", | |
"diocesesd.signourguestbook.com", | |
"diypoll.signourguestbook.com", | |
"djjayito.signourguestbook.com", | |
"dmc.signourguestbook.com", | |
"dodygood.signourguestbook.com", | |
"dragoni.signourguestbook.com", | |
"edinburg.signourguestbook.com", | |
"ellenmeister.signourguestbook.com", | |
"etnoyen.signourguestbook.com", | |
"fgfservices.signourguestbook.com", | |
"flipper0828.signourguestbook.com", | |
"flipper082859.signourguestbook.com", | |
"flipper828.signourguestbook.com", | |
"founder.signourguestbook.com", | |
"frankie66.signourguestbook.com", | |
"frobert.signourguestbook.com", | |
"froogle.signourguestbook.com", | |
"fujiprofessional.signourguestbook.com", | |
"funn2009.signourguestbook.com", | |
"gbstalag.signourguestbook.com", | |
"gearle123.signourguestbook.com", | |
"genejones.signourguestbook.com", | |
"gilgerarddotcom.signourguestbook.com", | |
"gmo2010.signourguestbook.com", | |
"goatlocker.signourguestbook.com", | |
"gr8danelover.signourguestbook.com", | |
"gregoryabbott.signourguestbook.com", | |
"gwmcrae.signourguestbook.com", | |
"www.hellsangelsberdoo.signourguestbook.com", | |
"homeagainfarm.signourguestbook.com", | |
"hsjeguestbook.signourguestbook.com", | |
"ifp.signourguestbook.com", | |
"iranianfootballpage.signourguestbook.com", | |
"iuecone.signourguestbook.com", | |
"janapood.signourguestbook.com", | |
"jeremycallaghanfansite.signourguestbook.com", | |
"jimjammer1.signourguestbook.com", | |
"jiyushinkai.signourguestbook.com", | |
"johnmorganhappyhour.signourguestbook.com", | |
"judygarland7.signourguestbook.com", | |
"k6ge.signourguestbook.com", | |
"kathytemean.signourguestbook.com", | |
"khesanhvet.signourguestbook.com", | |
"legends.signourguestbook.com", | |
"limahl.signourguestbook.com", | |
"linedancer.signourguestbook.com", | |
"louisetaylor.signourguestbook.com", | |
"lukabal.signourguestbook.com", | |
"majesticworld.signourguestbook.com", | |
"marilynsorensen.signourguestbook.com", | |
"mdrumm.signourguestbook.com", | |
"middlegeorgiaparanormal.signourguestbook.com", | |
"mysteryfeet.signourguestbook.com", | |
"nashwaaksis.signourguestbook.com", | |
"neatoday-danvers.signourguestbook.com", | |
"neatoday-reno.signourguestbook.com", | |
"neshkov.signourguestbook.com", | |
"netwish.signourguestbook.com", | |
"ngravley.signourguestbook.com", | |
"nicko62.signourguestbook.com", | |
"nverona.signourguestbook.com", | |
"onepeople.signourguestbook.com", | |
"onsight.signourguestbook.com", | |
"pannudds.signourguestbook.com", | |
"patjamesguestbook.signourguestbook.com", | |
"rheidt.signourguestbook.com", | |
"richards.signourguestbook.com", | |
"rorocny.signourguestbook.com", | |
"rprather.signourguestbook.com", | |
"ruralwillys.signourguestbook.com", | |
"scbbbc.signourguestbook.com", | |
"segitseganglia.signourguestbook.com", | |
"selectee.signourguestbook.com", | |
"shahid74.signourguestbook.com", | |
"spiderlakeretreat.signourguestbook.com", | |
"strwynd.signourguestbook.com", | |
"sweetblues.signourguestbook.com", | |
"teahouse.signourguestbook.com", | |
"thmch.signourguestbook.com", | |
"tigerforcerecon.signourguestbook.com", | |
"tsrl.signourguestbook.com", | |
"undertheblades.signourguestbook.com", | |
"usscanopus.signourguestbook.com", | |
"ussfranklindroosevelt.signourguestbook.com", | |
"usshollandas32.signourguestbook.com", | |
"ussstr.signourguestbook.com", | |
"vavau.signourguestbook.com", | |
"vines4u.signourguestbook.com", | |
"w5dxs.signourguestbook.com", | |
"webmasteratbigt.signourguestbook.com", | |
"wellofstars.signourguestbook.com", | |
"westburyfd.signourguestbook.com", | |
"westmauivacation.signourguestbook.com", | |
"yerusha.signourguestbook.com", | |
"signourguestbook.com", | |
"a2p.signourguestbook.com", | |
"aaronforever.signourguestbook.com", | |
"agenesiscorpuscallosum.signourguestbook.com", | |
"akphoto7.signourguestbook.com", | |
"auban.signourguestbook.com", | |
"biggles.signourguestbook.com", | |
"blackpoolweddings.signourguestbook.com", | |
"bobkunnel.signourguestbook.com", | |
"cavaliers.signourguestbook.com", | |
"classicalmysterytour.signourguestbook.com", | |
"cottages.signourguestbook.com", | |
"crandr.signourguestbook.com", | |
"davidlharrison.signourguestbook.com", | |
"dominique.signourguestbook.com", | |
"dorgalli.signourguestbook.com", | |
"dorgalli2.signourguestbook.com", | |
"e2dennis.signourguestbook.com", | |
"earles.signourguestbook.com", | |
"exlancs.signourguestbook.com", | |
"fgfprojects.signourguestbook.com", | |
"hotclub.signourguestbook.com", | |
"ileategbe.signourguestbook.com", | |
"jansmurph.signourguestbook.com", | |
"jarhead9962.signourguestbook.com", | |
"jarhead9962-2.signourguestbook.com", | |
"jessicahaffer.signourguestbook.com", | |
"jhatch.signourguestbook.com", | |
"jlgage01.signourguestbook.com", | |
"journeyguestbook.signourguestbook.com", | |
"judyarnold.signourguestbook.com", | |
"keeney.signourguestbook.com", | |
"kiaorana.signourguestbook.com", | |
"klaxtonbrown.signourguestbook.com", | |
"larryebailey.signourguestbook.com", | |
"luishiggins.signourguestbook.com", | |
"minty.signourguestbook.com", | |
"mk74scott.signourguestbook.com", | |
"ngsir.signourguestbook.com", | |
"nickmessinger.signourguestbook.com", | |
"patj5338.signourguestbook.com", | |
"president.signourguestbook.com", | |
"pwhsa.signourguestbook.com", | |
"qstogether.signourguestbook.com", | |
"rietomosaka.signourguestbook.com", | |
"rinksgal.signourguestbook.com", | |
"seinebight.signourguestbook.com", | |
"shadyacresfl.signourguestbook.com", | |
"skkky.signourguestbook.com", | |
"sobs.signourguestbook.com", | |
"southerncpafirm.signourguestbook.com", | |
"southernsteel.signourguestbook.com", | |
"speculumgregis.signourguestbook.com", | |
"sweetwaterbranch.signourguestbook.com", | |
"tghhampton.signourguestbook.com", | |
"thomaswikman.signourguestbook.com", | |
"truthministries.signourguestbook.com", | |
"ukzorro.signourguestbook.com", | |
"6987th.signourguestbook.com", | |
"brasseauxskennel.signourguestbook.com", | |
"dccc.signourguestbook.com", | |
"denicefranke.signourguestbook.com", | |
"dkranig.signourguestbook.com", | |
"elanamusic.signourguestbook.com", | |
"fairwayclay.signourguestbook.com", | |
"gtisc.signourguestbook.com", | |
"junglerosebeauty.signourguestbook.com", | |
"marywel.signourguestbook.com", | |
"mrp.signourguestbook.com", | |
"oliviagracearmand.signourguestbook.com", | |
"oshws.signourguestbook.com", | |
"raff.signourguestbook.com", | |
"robinson.signourguestbook.com", | |
"twest1117.signourguestbook.com", | |
"christiansurvivors.signourguestbook.com", | |
"cvt.signourguestbook.com", | |
"demonknightsmc.signourguestbook.com", | |
"dmccunn.signourguestbook.com", | |
"econ.signourguestbook.com", | |
"feedback.signourguestbook.com", | |
"fll.signourguestbook.com", | |
"heirloomjewelryandcoins.signourguestbook.com", | |
"highla12.signourguestbook.com", | |
"jerrywmcdaniel.signourguestbook.com", | |
"jmaclean.signourguestbook.com", | |
"km811.signourguestbook.com", | |
"lburg.signourguestbook.com", | |
"plantters.signourguestbook.com", | |
"renobailey.signourguestbook.com", | |
"steveweed.signourguestbook.com", | |
"stirl1.signourguestbook.com", | |
"tymarshal.signourguestbook.com", | |
"whsa.signourguestbook.com", | |
"mfwc.signourguestbook.com", | |
"pioneers.signourguestbook.com", | |
"roughneckcity.signourguestbook.com", | |
"aceofcups.signourguestbook.com", | |
"anthonybooty.signourguestbook.com", | |
"campnaire.signourguestbook.com", | |
"carolann.signourguestbook.com", | |
"cvmassage.signourguestbook.com", | |
"epmcentral.signourguestbook.com", | |
"haysjj.signourguestbook.com", | |
"ife.signourguestbook.com", | |
"kaeo.signourguestbook.com", | |
"kathih.signourguestbook.com", | |
"malites.signourguestbook.com", | |
"margaritabiz.signourguestbook.com", | |
"microflyers.signourguestbook.com", | |
"myjeblog.signourguestbook.com", | |
"rivhunter.signourguestbook.com", | |
"rosary.signourguestbook.com", | |
"sandsoyveret.signourguestbook.com", | |
"usstroutss566.signourguestbook.com", | |
"amazinglyawesome.signourguestbook.com", | |
"hoodrelatedentertainment.signourguestbook.com", | |
"sgforums.signourguestbook.com", | |
"smile.signourguestbook.com", | |
"yellowstone.signourguestbook.com", | |
"yourportchestersnapshot.signourguestbook.com", | |
"chayzlounge.signourguestbook.com", | |
"cjohnson.signourguestbook.com", | |
"pcdl.signourguestbook.com", | |
"rthompson.signourguestbook.com", | |
"waymartlodge.signourguestbook.com", | |
"ladyphenie.signourguestbook.com", | |
"hermajesty.signourguestbook.com", | |
"otkws1.signourguestbook.com", | |
"stmarkshighinfo.signourguestbook.com", | |
"vuguest.signourguestbook.com", | |
"wimpolepast.signourguestbook.com", | |
"rakshi.signourguestbook.com", | |
"watzegtdebijbel.signourguestbook.com", | |
"wimbledonosteopathy.signourguestbook.com", | |
"320thwebmaster.signourguestbook.com", | |
"joozis.signourguestbook.com", | |
"jwhc.signourguestbook.com", | |
"customcakes.signourguestbook.com", | |
"newhopepennsylvania.signourguestbook.com", | |
"txpride.signourguestbook.com", | |
"grannyrocks.signourguestbook.com", | |
"nekmar.signourguestbook.com", | |
"tonytdm.signourguestbook.com", | |
"usvisamex.signourguestbook.com", | |
"bcomfetish.signourguestbook.com", | |
"ginnydeer.signourguestbook.com", | |
"girulinda.signourguestbook.com", | |
"jay901rana.signourguestbook.com", | |
"judel.signourguestbook.com", | |
"madjingaye.signourguestbook.com", | |
"marine-staff-sergeant.signourguestbook.com", | |
"pypes.signourguestbook.com", | |
"tellurideairport.signourguestbook.com", | |
"woolybear.signourguestbook.com", | |
"mamaslittlecatering.signourguestbook.com", | |
"starfish.signourguestbook.com", | |
"ngsinlin.signourguestbook.com", | |
"hleaglesnewssite.signourguestbook.com", | |
"kathykathy.signourguestbook.com", | |
"leahsjourney.signourguestbook.com", | |
"sorsogonunited.signourguestbook.com", | |
"vvmc.signourguestbook.com", | |
"padrecataag.signourguestbook.com", | |
"sagebrush-cantina.signourguestbook.com", | |
"donscottcourt.signourguestbook.com", | |
"dvc.signourguestbook.com", | |
"travelingjournalis.signourguestbook.com", | |
"standby5.signourguestbook.com", | |
"sedonakat.signourguestbook.com", | |
"masticmaster.signourguestbook.com", | |
"sauceboss.signourguestbook.com", | |
"soc63.signourguestbook.com", | |
"desipandit.signourguestbook.com", | |
"hethaifa.signourguestbook.com", | |
"sola24.signourguestbook.com" | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment