Skip to content

Instantly share code, notes, and snippets.

@oleg-agapov
Last active October 14, 2019 18:53
Show Gist options
  • Save oleg-agapov/803f035aeeaeeee26eb74401b31dec7d to your computer and use it in GitHub Desktop.
Save oleg-agapov/803f035aeeaeeee26eb74401b31dec7d to your computer and use it in GitHub Desktop.
# df_raw is the input DataFrame (defined elsewhere); its `user_json` column holds one JSON string per row.
def parse_and_clean(data_frame: pd.DataFrame) -> pd.DataFrame:
    """Flatten per-user JSON documents into one cleaned row per visit.

    Each ``user_json`` value is parsed and its top-level keys become
    columns. The ``visits`` lists are then exploded to one row per visit,
    joined back to the per-user columns on ``uid``, and reduced to the
    columns ``gender``, ``age``, ``uid``, ``url``, ``timestamp`` plus a
    derived ``domain`` column.

    Args:
        data_frame: Frame with a ``user_json`` column of JSON strings.
            NOTE(review): ``uid``, ``gender`` and ``age`` must exist after
            parsing; whether they come from the input columns or from the
            JSON is not visible here -- confirm against the dataset.

    Returns:
        A frame with one row per visit, rows containing any missing value
        dropped, and ``timestamp`` converted from epoch milliseconds to
        pandas datetimes.
    """
    # Expand every JSON document into columns placed next to the raw data.
    parsed_json = data_frame["user_json"].apply(json.loads).apply(pd.Series)
    users = data_frame.join(parsed_json)
    # Newlines are stripped from the raw JSON text; this column is not part
    # of the returned frame.
    users["user_json"] = users["user_json"].apply(
        lambda raw: raw.replace('\n', '')
    )

    # One row per visit: repeat each uid once per entry of its visits list.
    visits_per_user = users.visits.str.len()
    exploded = pd.DataFrame(
        {
            "uid": users.uid.repeat(visits_per_user),
            "sites": np.concatenate(users.visits.values),
        }
    ).reset_index()

    # Turn the visit records into columns (expects "url" and "timestamp").
    visits = pd.DataFrame(list(exploded.sites))
    visits["uid"] = exploded.uid
    visits["url"] = visits["url"].apply(
        lambda link: link.replace("\n", "").replace("\r", "")
    )

    # Attach the per-user attributes and keep only the columns of interest.
    wanted_columns = ["gender", "age", "uid", "url", "timestamp"]
    result = pd.merge(visits, users, how="left", on="uid")[wanted_columns]
    result["domain"] = result.url.apply(url_to_domain)
    # url_to_domain yields None for rejected URLs; those rows go away here
    # together with any other row that has a missing value.
    result = result.dropna()
    result["timestamp"] = pd.to_datetime(result.timestamp, unit="ms")
    return result
def url_to_domain(url: str) -> "str | None":
    """Return the host part of an http(s) URL without a leading ``www.``.

    Repeated scheme prefixes (for example ``http://https://``) are first
    collapsed into a single ``http://``. The URL is then stripped of
    surrounding whitespace, percent-decoded and parsed.

    Args:
        url: URL text to process.

    Returns:
        The parsed network location with a leading ``www.`` removed and
        surrounding whitespace stripped, or ``None`` when the scheme is
        neither ``http`` nor ``https``. A URL whose network location is
        empty yields ``""``.
    """
    # The return annotation is quoted so it is never evaluated at definition
    # time and needs no extra import.
    # Collapse duplicated scheme prefixes; only the host is used afterwards,
    # so normalising "https" to "http" here loses nothing.
    url = re.sub(r'(http(s)*://)+', 'http://', url)
    parsed_url = urlparse(unquote(url.strip()))
    if parsed_url.scheme not in ('http', 'https'):
        return None

    netloc = parsed_url.netloc
    # Plain prefix check instead of the former non-raw regex "(?:www\\.)?(.*)",
    # which contained an invalid escape sequence and always matched.
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc.strip()
def get_domains(input_json):
    """Return the domain of every visit recorded in a user's JSON document.

    Args:
        input_json: JSON text with a top-level ``"visits"`` list whose
            entries each carry a ``"url"``.

    Returns:
        A list with one ``url_to_domain`` result per visit, in visit order.
        Entries are whatever ``url_to_domain`` returns for that URL.
    """
    return [
        url_to_domain(entry["url"])
        for entry in json.loads(input_json)["visits"]
    ]
def get_visits_by_hour(input_json):
    """Count a user's visits per hour of the day.

    Args:
        input_json: JSON text with a top-level ``"visits"`` list whose
            entries each carry a ``"timestamp"`` in epoch milliseconds
            (as a number or a numeric string).

    Returns:
        A list of 24 ints; element ``h`` is the number of visits whose
        timestamp falls in hour ``h``.
    """
    hourly_counts = [0] * 24
    for entry in json.loads(input_json)["visits"]:
        epoch_seconds = int(entry["timestamp"]) / 1000
        # NOTE(review): fromtimestamp() without a tz argument converts to the
        # machine's local time zone, so the buckets depend on where this runs.
        bucket = datetime.datetime.fromtimestamp(epoch_seconds).hour
        hourly_counts[bucket] += 1
    return hourly_counts
# Build features on a copy of the first 100 rows of the input dataset.
train = df_raw.iloc[:100].copy()
# Per-user 24-element vector of visit counts by hour of day.
train["hour"] = train.user_json.apply(get_visits_by_hour)
# Space-separated domains per user. url_to_domain returns None for URLs it
# rejects, and str.join raises TypeError on None, so skip those entries.
train["domains"] = train.user_json.apply(get_domains).apply(
    lambda domains: ' '.join(d for d in domains if d is not None)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment