Skip to content

Instantly share code, notes, and snippets.

@answerquest
Last active October 10, 2022 06:22
Show Gist options
  • Save answerquest/04675b6ecc155e5dcf8a5024ec00b597 to your computer and use it in GitHub Desktop.
Save answerquest/04675b6ecc155e5dcf8a5024ec00b597 to your computer and use it in GitHub Desktop.
SOI Toponyms data import, csv making
SOI Toponyms data import to postgresql DB, csv making
DROP TABLE IF EXISTS soi_toponyms;
CREATE TABLE soi_toponyms (
sr SERIAL PRIMARY KEY,
fid VARCHAR(100) NULL,
objectid VARCHAR(100) NULL,
objectid_1 VARCHAR(100) NULL,
feature VARCHAR(100) NULL,
texta VARCHAR(100) NULL,
gmrotation DECIMAL(9,3) NULL,
color SMALLINT NULL,
stylea SMALLINT NULL,
weight SMALLINT NULL,
font SMALLINT NULL,
unique_id VARCHAR(100) NULL,
svy_date VARCHAR(100) NULL,
orig_fid INT NULL,
fid_2 INT NULL,
objectid_12 INT NULL,
everest_sh VARCHAR(100) NULL,
osm_sheet_ VARCHAR(100) NULL,
updated VARCHAR(100) NULL,
restricted VARCHAR(100) NULL,
responsibi VARCHAR(100) NULL,
shape_leng DECIMAL(15,3) NULL,
fid_12 INT NULL,
feature_1 VARCHAR(100) NULL,
text_1 VARCHAR(100) NULL,
hindi VARCHAR(100) NULL,
bengali VARCHAR(100) NULL,
gujarati VARCHAR(100) NULL,
kannada VARCHAR(100) NULL,
malayalam VARCHAR(100) NULL,
marathi VARCHAR(100) NULL,
punjabi VARCHAR(100) NULL,
tamil VARCHAR(100) NULL,
telugu VARCHAR(100) NULL,
roman VARCHAR(100) NULL,
assamese VARCHAR(100) NULL,
objectid_2 VARCHAR(100) NULL,
bodo VARCHAR(100) NULL,
dogri VARCHAR(100) NULL,
konkani VARCHAR(100) NULL,
kashmiri VARCHAR(100) NULL,
maithili VARCHAR(100) NULL,
manipuri VARCHAR(100) NULL,
nepali VARCHAR(100) NULL,
oriya VARCHAR(100) NULL,
sindhi VARCHAR(100) NULL,
sanskrit VARCHAR(100) NULL,
santhali VARCHAR(100) NULL,
urdu VARCHAR(100) NULL,
geometry GEOMETRY(POINT,4326) NULL
);
CREATE INDEX soi_toponyms_geom_1 ON soi_toponyms USING GIST (geometry);
2022-10-09 14:57:00 : soi_toponyms table truncated and serial num reset
2022-10-09 14:57:00 : Removing pre-existing toponyms/soi_toponyms.csv
2022-10-09 14:57:01 : Batch size: 100000
2022-10-09 14:58:06 : 1 batches sent
2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: text = 'D O D D A P P A N Ā Y A K K A N Ū R S O U T H R F'
2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: text_1 = 'D O D D A P P A N Ā Y A K K A N Ū R S O U T H R F'
2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: roman = 'D O D D A P P A N A Y A K K A N U R S O U T H R F'
2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: text = 'S I R U M A L A I R E S E R V E D F O R E S T'
2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: text_1 = 'S I R U M A L A I R E S E R V E D F O R E S T'
2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: roman = 'S I R U M A L A I R E S E R V E D F O R E S T'
2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: text = 'H U L I K A L D U R G A M R E S E R V E D F O R E S T'
2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: text_1 = 'H U L I K A L D U R G A M R E S E R V E D F O R E S T'
2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: roman = 'H U L I K A L D U R G A M R E S E R V E D F O R E S T'
2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: text = 'C H I L A N D V Ā D I S T A T E F O R E S T'
2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: text_1 = 'C H I L A N D V Ā D I S T A T E F O R E S T'
2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: roman = 'C H I L A N D V A D I S T A T E F O R E S T'
2022-10-09 14:59:08 : 2 batches sent
2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: text = 'K A R V E T N A G A R P R O T E C T E D F O R E S T'
2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: text_1 = 'K A R V E T N A G A R P R O T E C T E D F O R E S T'
2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: roman = 'K A R V E T N A G A R P R O T E C T E D F O R E S T'
2022-10-09 15:00:08 : 3 batches sent
2022-10-09 15:01:22 : 4 batches sent
2022-10-09 15:02:23 : 5 batches sent
2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: text = 'I D U P U L A P Ā Y A & R E D L A C H E R U V U R E S E R V E D F O R E S T'
2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: text_1 = 'I D U P U L A P Ā Y A & R E D L A C H E R U V U R E S E R V E D F O R E S T'
2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: roman = 'I D U P U L A P A Y A & R E D L A C H E R U V U R E S E R V E D F O R E S T'
2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: text = 'P Ā L K O N D A & V A N G A M A L L A R E S E R V E D F O R E S T'
2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: text_1 = 'P Ā L K O N D A & V A N G A M A L L A R E S E R V E D F O R E S T'
2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: roman = 'P A L K O N D A & V A N G A M A L L A R E S E R V E D F O R E S T'
2022-10-09 15:03:30 : 6 batches sent
2022-10-09 15:04:33 : 7 batches sent
2022-10-09 15:05:35 : 8 batches sent
2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: text = 'R Ā M N A G A R P R O T E C T E D F O R E S T'
2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: text_1 = 'R Ā M N A G A R P R O T E C T E D F O R E S T'
2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: roman = 'R A M N A G A R P R O T E C T E D F O R E S T'
2022-10-09 15:06:36 : 9 batches sent
2022-10-09 15:07:36 : 10 batches sent
2022-10-09 15:08:38 : 11 batches sent
2022-10-09 15:09:40 : 12 batches sent
2022-10-09 15:09:54 : Annoyingly huge field, zapping all spaces: text = 'K O T T U R U B A T T i L i R E S E R V E D F O R E S T'
2022-10-09 15:09:54 : Annoyingly huge field, zapping all spaces: roman = 'K O T T U R U B A T T i L i R E S E R V E D F O R E S T'
2022-10-09 15:10:33 : 13 batches sent
2022-10-09 15:10:33 : Toponyms import completed.
# soi_toponyms_import.py
# 2022-10-09 by Nikhil VJ, https://nikhilvj.co.in
# source data: .geojonl from https://www.kaggle.com/datasets/planemad/soi-india-map-toponyms/versions/3
# Importing to a Postgresql DB AND outputting to a CSV (v2), with bad chars/lines fixed
# to do: read in the geojsonl,
# do find-replace of chars,
# ensure there are no \ slashes in each field
# export to CSV AND to DB
dataFile = 'toponyms/soi-osm-toponyms.geojsonl.json'
batchSize = 100000
skip2Batch = 0
rename_cols = {"OBJECTID": "objectid", "text":"texta", "style":"stylea"}
columns_order = ['latitude','longitude'] # initiate base list of cols
csvFile = 'toponyms/soi_toponyms.csv'
from sqlalchemy import create_engine
import pandas as pd
import geopandas as gpd
import datetime, os, json, io, sys
from itertools import islice # https://stackoverflow.com/a/6335876/4355695
def logmessage( *content ):
timeOffset = 5.5
timestamp = '{:%Y-%m-%d %H:%M:%S} :'.format(datetime.datetime.utcnow() + datetime.timedelta(hours=timeOffset)) # from https://stackoverflow.com/a/26455617/4355695
line = ' '.join(str(x) for x in list(content)) # from https://stackoverflow.com/a/3590168/4355695
print(timestamp, line) # print to screen also
logFilename = 'log.txt'
with open(logFilename, 'a') as f:
print(timestamp, line, file=f) # file=f argument at end writes to file. from https://stackoverflow.com/a/2918367/4355695
def makegpd(x,lat='latitude',lon='longitude'):
gdf = gpd.GeoDataFrame(x, geometry=gpd.points_from_xy(x[lon],x[lat]), crs="EPSG:4326")
gdf.drop(columns=[lat,lon], inplace=True)
return gdf
#########
creds = json.load(open('local_dbcreds.json','r'))
engine = create_engine(f"postgresql://{creds['DB_USER']}:{creds['DB_PW']}@{creds['DB_SERVER']}:{creds['DB_PORT']}/{creds['DB_DBNAME']}")
########
d1 = "TRUNCATE TABLE soi_toponyms RESTART IDENTITY"
c = engine.connect()
res = c.execute(d1)
c.close()
logmessage("soi_toponyms table truncated and serial num reset")
if os.path.isfile(csvFile):
logmessage(f"Removing pre-existing {csvFile}")
os.remove(csvFile)
########
logmessage(f"Batch size: {batchSize}")
batchCounter = 0
# Reading batches of lines, from https://stackoverflow.com/a/6335876/4355695
with open(dataFile,'r') as f:
while True:
next_n_lines = list(islice(f, batchSize))
if not next_n_lines:
break
if (skip2Batch-1) > batchCounter:
batchCounter += 1
logmessage(f"skipping batch {batchCounter}")
continue
collector = []
for g in next_n_lines:
try:
h = json.loads(g)
except:
print(g)
raise
props = h.get('properties',{})
row = {}
for key in props.keys():
value = props[key]
# do char replacement
if isinstance(value,str):
if len(value) > 0:
value2 = value.replace('\n','īn').replace('\r','īr').replace('>','Ā').replace('<','ā').replace('|','Ī').replace('@','Ū').replace('#','ū').replace('\\','ī').replace(' '*5,' ').strip()
# also zapping silly spaces
# nuclear option for annoying spaces
if len(value2) > 100:
logmessage(f"Annoyingly huge field, zapping all spaces: {key} = '{value2}'")
value2 = value2.replace(' ','')
if len(value2) > 100:
value2 = value2[:100]
logmessage(f"Still huge after zapping spaces also, truncating it. Final: {key} = '{value2}'")
# if value2 != value:
# print(f"value: {value} changed to: {value2}")
else:
value2 = ''
else:
value2 = value
if key in rename_cols.keys():
row[rename_cols[key]] = value2
else:
row[key] = value2
ll = h.get('geometry',{}).get('coordinates',[])
row['longitude'] = ll[0]
row['latitude'] = ll[1]
collector.append(row)
df1 = pd.DataFrame(collector)
# data adjustment:
def rounding1(x):
try:
x = float(x)
return round(x,3)
except:
return x
df1['gmrotation'] = df1['gmrotation'].apply(rounding1)
df1['shape_leng'] = df1['shape_leng'].apply(rounding1)
# export to csv
if not os.path.isfile(csvFile):
# logmessage("writing new file")
df1.to_csv(csvFile, index=False)
columns_order = df1.columns.tolist()
else:
# logmessage("Appending to file")
df1[columns_order].to_csv(csvFile, index=False, header=False, mode='a')
gdf1 = makegpd(df1)
try:
gdf1.to_postgis('soi_toponyms', engine, if_exists='append', index=False)
except Exception as e:
print(f"Errored out at batch {batchCounter-1}, saving to faulty.csv raw text to faulty.txt")
df1.to_csv('toponyms/faulty.csv',index=False)
with open('toponyms/faulty.txt','w') as f:
f.write(''.join(next_n_lines))
print('\n\n')
logmessage(e)
sys.exit()
batchCounter += 1
logmessage(f"{batchCounter} batches sent")
logmessage(f"Toponyms import completed.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment