Created
March 13, 2023 20:35
-
-
Save JettScythe/5959b5b159386cfb330cf62b4cbda842 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from phonenumbers import geocoder, carrier, number_type, parse, is_valid_number | |
import argparse | |
import json | |
import csv | |
from phone_gen import PhoneNumber | |
from rapidfuzz import process, fuzz | |
# Display labels for phone number types, keyed by the integer code that is
# looked up as type_map[number_type(parsed_number)] when a row is written.
type_map = {
    0: "Fixed Line",
    1: "Mobile",
    2: "Fixed Line or Mobile",
    3: "Toll Free",
    4: "Premium Rate",
    5: "Shared Cost",
    6: "VOIP",
    7: "Personal Number",
    8: "Pager",
    9: "Universal Access Number / Company Number",
    10: "Voicemail",
    99: "Unknown",
}
# Command-line interface: two required positional arguments. They are parsed
# immediately, so the rest of the script reads them from the module-level `args`.
parser = argparse.ArgumentParser(
    description='Generate phone numbers for a country and save them to a CSV file.',
)
parser.add_argument(
    'country_code',
    type=str,
    help='The country code for the phone numbers to be generated. eg: US, CA, IN',
)
parser.add_argument(
    'num_of_phones',
    type=int,
    help='The amount of numbers you would like to generate',
)
args = parser.parse_args()
# Lookup tables loaded once at start-up from JSON files in the working
# directory. JSON text is UTF-8 by specification, so the encoding is pinned
# instead of being left to the platform's locale default.
with open('pincodes.json', 'r', encoding='utf-8') as pincode_file:
    # Indexed by city name later in the script: pincode_map[city] -> pincode.
    pincode_map = json.load(pincode_file)
with open('pincode_district_map.json', 'r', encoding='utf-8') as pincode_districts_file:
    # Iterated later as records that are read through their "Pincode" and
    # "Districtname" keys.
    pincode_district_map = json.load(pincode_districts_file)
def create_final_dict(exact_matches):
    """Flatten matched census rows into a single lookup dictionary.

    For every row, the lower-cased "Level" value becomes a key whose value is
    the row's "Name". When the row's "TRU" value is "Rural", "Urban" or
    "Total", the row's "TOT_P" value is additionally stored under
    "<Name>_rural_pop", "<Name>_urban_pop" or "<Name>_total_pop".
    Later rows overwrite earlier ones that produce the same key.

    :param exact_matches: iterable of mappings with the keys "Level", "Name",
        "TRU" and "TOT_P".
    :return: the flattened dictionary (empty when no rows are given).
    """
    # Recognised "TRU" values and the key fragment each one contributes.
    pop_kinds = {"Rural": "rural", "Urban": "urban", "Total": "total"}

    flattened = {}
    for record in exact_matches:
        place_name = record["Name"]
        flattened[record["Level"].lower()] = place_name
        kind = pop_kinds.get(record["TRU"])
        if kind is not None:
            flattened[f"{place_name}_{kind}_pop"] = record["TOT_P"]
    return flattened
def get_needed_maps_data(assumed_district: str, search_term: str):
    """Collect the ``merged.csv`` rows describing a place inside one district.

    The file is read once. Rows are considered only while the scan is inside
    a section that starts at a "DISTRICT" row named ``assumed_district`` and
    ends at the next "DISTRICT" row with a different name. Within such a
    section three kinds of rows are kept:

    * the "DISTRICT" rows themselves (including the one that opens the
      section, which the previous implementation consumed without recording);
    * non-district rows whose name equals ``search_term``;
    * "SUB-DISTRICT" rows whose name equals ``assumed_district``.

    All name comparisons are case-insensitive.

    NOTE(review): ending the section at the next district assumes that
    ``merged.csv`` lists each district's rows contiguously after its
    "DISTRICT" row(s) — confirm against the data file.

    :param assumed_district: name of the district to search inside.
    :param search_term: name of the city/village/sub-district being looked up.
    :return: the dictionary produced by ``create_final_dict`` from the kept
        rows, ordered city matches, then district rows, then sub-district rows.
    """
    district_key = assumed_district.lower()
    wanted_name = search_term.lower()

    exact_city_matches = []
    exact_district_matches = []
    exact_subdistrict_matches = []

    inside_district = False
    # newline="" is what the csv module asks for when it is handed a file object.
    with open("merged.csv", newline="") as maps_data:
        for row in csv.DictReader(maps_data):
            name = row["Name"].lower()
            level = row["Level"]

            if level == "DISTRICT":
                # Every district row opens or closes the section of interest.
                inside_district = name == district_key
                if inside_district:
                    exact_district_matches.append(row)
                continue

            if not inside_district:
                continue

            # The search-term check comes first so that a sub-district sharing
            # the searched name is still reported as the city match.
            if name == wanted_name:
                exact_city_matches.append(row)
            elif level == "SUB-DISTRICT" and name == district_key:
                exact_subdistrict_matches.append(row)

    merged_list = exact_city_matches + exact_district_matches + exact_subdistrict_matches
    return create_final_dict(merged_list)
def generate_mobile_numbers():
    """Generate ``args.num_of_phones`` distinct, valid numbers for ``args.country_code``.

    Candidates come from ``phone_gen.PhoneNumber`` and are kept only when
    ``phonenumbers.is_valid_number`` accepts them. Collecting into a set
    discards duplicates, so the loop keeps drawing until enough unique
    numbers have been found.

    NOTE(review): despite the function name, nothing here restricts the
    result to mobile numbers — confirm whether a type filter was intended.

    :return: a set of phone number strings as returned by ``get_number()``.
    """
    # The generator depends only on the country code, so build it once rather
    # than on every loop iteration.
    generator = PhoneNumber(args.country_code)
    wanted_count = args.num_of_phones

    unique_phone_numbers = set()
    while len(unique_phone_numbers) < wanted_count:
        candidate = generator.get_number()
        if is_valid_number(parse(candidate)):
            unique_phone_numbers.add(candidate)
    return unique_phone_numbers
# Main script: generate the numbers, enrich each one with carrier, location,
# pincode and census population data, and write one CSV row per number.
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it extra blank lines appear on platforms that translate newlines.
with open(f'phone_numbers_{args.country_code}_{args.num_of_phones}.csv',
          mode='w', newline='') as file:
    writer = csv.writer(file)
    # Header; every data row below must follow this column order.
    writer.writerow(
        ['Phone Number', 'Type', 'Parsed Carrier', 'Parsed City',
         'Parsed Region', 'Pincode', 'District From Pincode', 'Sub-District From Pincode',
         'Village From Pincode', 'Village Total Population', 'Village Rural Population', 'Village Urban Population',
         'Sub-District Total Population', 'Sub-District Rural Population', 'Sub-District Urban Population',
         'District Total Population', 'District Rural Population', 'District Urban Population']
    )
    # City names known to the pincode table; the view is loop-invariant.
    choices = pincode_map.keys()
    for phone_num in generate_mobile_numbers():
        parsed_phone_number = parse(phone_num)
        parsed_region = geocoder.description_for_number(parsed_phone_number, "en")
        parsed_city = ""
        pincode = ""
        # Reset for every number so a failed lookup can neither raise NameError
        # nor silently reuse the previous number's census data.
        mapped_data_row = {}

        # "City, Region" descriptions are split on the first ", "; partition
        # cannot fail to unpack when there are more (or fewer) than two parts.
        if ", " in parsed_region:
            parsed_city, _, parsed_region = parsed_region.partition(", ")

        # Use the city as-is when the pincode table knows it, otherwise fall
        # back to the closest known name.
        # NOTE(review): when no city was parsed this fuzzy-matches an empty
        # string, as the original did — confirm that is intended.
        if parsed_city in choices:
            lookup_city = parsed_city
        else:
            results = process.extractOne(parsed_city, choices, scorer=fuzz.WRatio)
            # extractOne yields None when it has no candidate to offer.
            lookup_city = results[0] if results else None

        if lookup_city is not None:
            pincode = pincode_map[lookup_city]
            wanted_pincode = int(pincode)
            # Keep the last matching record, as before, but scan merged.csv
            # only once instead of once per matching record.
            district_name = None
            for pincode_row in pincode_district_map:
                if pincode_row.get("Pincode") == wanted_pincode:
                    district_name = pincode_row["Districtname"].lower()
            if district_name is not None:
                mapped_data_row = get_needed_maps_data(district_name, lookup_city)

        parsed_carrier = carrier.name_for_number(parsed_phone_number, "en", region=args.country_code)
        district = mapped_data_row.get("district")
        subdistrict = mapped_data_row.get("sub-district")
        village = mapped_data_row.get("village")
        phone_number_type = type_map[number_type(parsed_phone_number)]

        if parsed_region == "India":
            # Only a country-level description is available: the place columns
            # stay blank and fixed figures fill the last three columns.
            # NOTE(review): presumably country-wide total/rural/urban
            # population figures — confirm the source of these constants.
            output_row = [phone_num, phone_number_type, parsed_carrier, parsed_city, parsed_region, pincode,
                          "", "", "", "", "", "", "", "", "", 1416459205, 909384771, 498179071]
        else:
            # Total, rural, urban for village, then sub-district, then district,
            # matching the header order above.
            populations = [
                mapped_data_row.get(f"{place}_{kind}_pop")
                for place in (village, subdistrict, district)
                for kind in ("total", "rural", "urban")
            ]
            output_row = [phone_num, phone_number_type, parsed_carrier, parsed_city,
                          parsed_region, pincode, district, subdistrict, village, *populations]
        writer.writerow(output_row)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment