Skip to content

Instantly share code, notes, and snippets.

@Steampunkery
Created May 11, 2020 05:35
Show Gist options
  • Save Steampunkery/e948a91acf47ccb411080104227efd17 to your computer and use it in GitHub Desktop.
Save Steampunkery/e948a91acf47ccb411080104227efd17 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
import json
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import tensorflow as tf
import sys
sys.path.insert(0, 'gpt-2/src/')
import model, sample, encoder
import requests
from timeit import default_timer as timer
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
def generate_chunks(
    model_name='124M',
    seed=1,
    nsamples=1000,
    batch_size=1,
    length=25,
    temperature=1,
    top_k=0,
    top_p=1,
    models_dir='gpt-2/models',
    prefix="http://",
):
    """
    Sample continuations of ``prefix`` from a GPT-2 checkpoint.

    The model is always conditioned on the encoded ``prefix``; every returned
    string is ``prefix`` followed by the decoded continuation. The elapsed
    time of each batch (in seconds) is printed to stdout.

    Note: this uses the TensorFlow 1.x top-level graph APIs (``tf.Session``,
    ``tf.placeholder``, ``tf.set_random_seed``, ``tf.train.Saver``).

    :model_name='124M' : String, which model to use
    :seed=1 : Integer seed for random number generators, fix seed to reproduce
     results
    :nsamples=1000 : Number of samples to return total
    :batch_size=1 : Number of batches (only affects speed/memory). Must divide
     nsamples.
    :length=25 : Number of tokens in generated text
    :temperature=1 : Float value controlling randomness in boltzmann
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model will become deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=0 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 (default) is a
     special setting meaning no restrictions. 40 generally is a good value.
    :top_p=1 : Passed through unchanged to sample.sample_sequence
     (presumably the nucleus-sampling threshold -- confirm against gpt-2/src).
    :models_dir='gpt-2/models' : path to parent folder containing model
     subfolders (i.e. contains the <model_name> folder); ``~`` and environment
     variables are expanded
    :prefix='http://' : Text the model is conditioned on; also prepended to
     every returned chunk
    :returns: list of ``nsamples`` strings
    :raises ValueError: if batch_size is not positive or does not divide
     nsamples
    """
    # Validate up front (and with a real exception: asserts vanish under -O).
    if batch_size <= 0 or nsamples % batch_size != 0:
        raise ValueError(
            'batch_size must be a positive divisor of nsamples '
            '(got nsamples=%r, batch_size=%r)' % (nsamples, batch_size)
        )

    models_dir = os.path.expanduser(os.path.expandvars(models_dir))

    enc = encoder.get_encoder(model_name, models_dir)
    hparams = model.default_hparams()
    # Overlay the checkpoint's own hyperparameters on the defaults.
    with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = sample.sample_sequence(
            hparams=hparams, length=length,
            context=context,
            batch_size=batch_size,
            temperature=temperature, top_k=top_k, top_p=top_p
        )

        # Load the most recent checkpoint stored in the model's folder.
        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name))
        saver.restore(sess, ckpt)

        chunks = []
        context_tokens = enc.encode(prefix)
        # The same prompt is fed for every row of every batch, so build the
        # feed value once instead of once per iteration.
        batch_context = [context_tokens for _ in range(batch_size)]
        for _ in range(nsamples // batch_size):
            start = timer()
            # Slice off the prompt tokens so only the continuation is decoded.
            out = sess.run(output, feed_dict={
                context: batch_context
            })[:, len(context_tokens):]
            for i in range(batch_size):
                chunks.append(prefix + enc.decode(out[i]))
            end = timer()
            print(end - start)
    return chunks
# Matches an http:// or https:// URL that has at least one dot-separated
# host label; compiled once at import time rather than on every call.
_URL_RE = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')


def refine_chunks(chunks):
    """
    Reduce each chunk to the first URL it contains, in place.

    Every element of ``chunks`` is replaced by the first substring matching
    the URL pattern, or by ``None`` when the element contains no match or is
    not a string.

    :chunks : mutable sequence (e.g. list) of candidate strings; modified
     in place
    :returns: the same ``chunks`` object that was passed in
    """
    for i, chunk in enumerate(chunks):
        # Non-string entries cannot be searched; treat them like "no URL"
        # explicitly instead of relying on a catch-all except.
        match = _URL_RE.search(chunk) if isinstance(chunk, str) else None
        chunks[i] = match.group(0) if match is not None else None
    return chunks
def get_code(url):
    """
    Return the HTTP status code of a HEAD request to ``url``.

    The request uses a 1-second timeout. This is deliberately best-effort:
    any failure while making the request (bad URL, DNS error, timeout, ...)
    is reported as ``-1`` instead of being raised.

    :url : URL string to probe, or None
    :returns: the response's integer status code, or -1 if ``url`` is None
     or the request failed
    """
    if url is None:
        return -1
    try:
        return requests.head(url, timeout=1).status_code
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit can still stop a long run.
        return -1
def test_urls(urls):
    """
    Probe every entry of ``urls`` and collect the results.

    :urls : iterable of URL strings (entries may be None)
    :returns: dict mapping each entry to the value get_code returned for it;
     repeated entries are probed again and keep the latest result
    """
    codes = {}
    for candidate in urls:
        codes[candidate] = get_code(candidate)
    return codes
if __name__ == '__main__':
    # Run the whole (slow) generate -> refine -> probe pipeline first, and
    # only then open the output file: opening with 'w' truncates it, so a
    # failure mid-run would otherwise wipe any previous results.
    results = test_urls(refine_chunks(generate_chunks()))
    with open('result.json', 'w') as fp:
        json.dump(results, fp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment