I hereby claim:
- I am danielfrg on github.
- I am danielfrg (https://keybase.io/danielfrg) on keybase.
- I have a public key ASDYKve9COIyFov3ozEHC6eHuRZFZqPQq8b1ezthy4hNVgo
To claim this, I am signing this object:
#!/usr/bin/env python | |
# Copyright 2018 Google LLC | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# https://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software |
<!DOCTYPE html> | |
<html> | |
<head><meta charset="utf-8" /> | |
<title>matplotlib</title><script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"></script><link rel="stylesheet" href="https://unpkg.com/font-awesome@4.7.0/css/font-awesome.min.css" type="text/css" /> | |
I hereby claim:
To claim this, I am signing this object:
import pandas as pd | |
_input = 'dump0' | |
_output = 'html0.tdf' | |
df = pd.DataFrame({'url': [], 'html': []}) | |
df.to_csv(_output, sep='\t', index=None) | |
def append_tdf(urls, html): |
Register utils.py using jython as utils; | |
urls = LOAD 'INPUT_FILE' USING PigStorage('\t') AS (url:chararray); | |
query = FOREACH urls GENERATE utils.query(url) AS everything; | |
file = FOREACH query GENERATE FLATTEN(everything); | |
STORE file INTO 's3n://OUTPUT_DIR' USING PigStorage('\t'); |
import json | |
import luigi | |
import luigi.hdfs | |
import luigi.hadoop | |
import pandas as pd | |
import numpy | |
import pandas | |
luigi.hadoop.attach(numpy, pandas) |
import re | |
import json | |
import luigi | |
import pandas as pd | |
from mysolr import Solr | |
from bs4 import BeautifulSoup | |
class InputText(luigi.ExternalTask): |