Skip to content

Instantly share code, notes, and snippets.

@codersquid
Created August 10, 2015 15:12
Show Gist options
  • Save codersquid/c6a82d151a7f4b5f3b33 to your computer and use it in GitHub Desktop.
Save codersquid/c6a82d151a7f4b5f3b33 to your computer and use it in GitHub Desktop.
Example scripts showing how to create an item on archive.org with metadata attached to that item.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
I had a script to upload some conference files to archive.org a while back, and I can't remember
if this is the file I used. It could be. It's very krufty. Also, the internetarchive package has
reved a few times since.
this is only for an example
"""
import argparse
import json
import logging
import os
import sys
from internetarchive import get_item
logging.basicConfig(
format='%(asctime)s|%(levelname)s|%(message)s',
datefmt='%Y-%m-%d %H:%M',
filename='archive.log',
level=logging.DEBUG,
)
"""
This is a convenience script that wrapps ia-wrapper calls
http://archive.org/help/abouts3.txt
MEH boto escapes things and the internetarchive library handles things ok
"""
ACCESS_KEY="example"
SECRET_KEY="example"
def prepare(videos='videos', schedule_file='schedule.json'):
    """Collect the inputs for an upload run.

    Args:
        videos: directory containing the video files (defaults to the
            previously hard-coded 'videos' directory).
        schedule_file: path to the JSON schedule keyed by event id
            (defaults to the previously hard-coded 'schedule.json').

    Returns:
        dict with:
            'files'    -- listing of the videos directory,
            'path'     -- absolute path of the videos directory,
            'schedule' -- the parsed schedule JSON.
    """
    path = os.path.abspath(videos)
    files = os.listdir(path)
    with open(schedule_file) as fh:
        schedule = json.load(fh)
    return {
        'files': files,
        'path': path,
        'schedule': schedule,
    }
def upload_all(path, schedule):
    """Upload every scheduled .mp4 file in *path* to archive.org.

    Args:
        path: directory containing video files named <event_id>.mp4.
        schedule: mapping of event id -> metadata dict; each entry must
            contain a 'conference' key, used to build the item name.

    Files whose event id is not in the schedule are skipped with a warning;
    a failure on one file is logged and does not stop the remaining uploads.
    """
    files = os.listdir(path)
    for f in files:
        event_id = f.replace('.mp4', '')
        if event_id not in schedule:
            logging.warning('SKIPPING: %s', event_id)
            continue
        md = schedule[event_id]
        # Item identifiers must be unique on archive.org; derive one from
        # the conference name plus the frab event id.
        name = '%s_event_%s' % (md['conference'].lower().replace(' ', '_'), event_id)
        try:
            item = get_item(name)
            logging.debug("UPLOADING: event %s %s", event_id, name)
            uploaded = item.upload(os.path.join(path, f), metadata=md,
                                   access_key=ACCESS_KEY, secret_key=SECRET_KEY)
            if uploaded:
                logging.info("SUCCESS: event %s %s", event_id, name)
            else:
                # BUG FIX: failed uploads were logged at INFO, identical to
                # successes; log at ERROR so they stand out.
                logging.error("FAILURE: event %s %s", event_id, name)
        except Exception:
            # BUG FIX: was a bare `except:` which also caught
            # KeyboardInterrupt/SystemExit and only recorded the exception
            # type; logging.exception records the full traceback.
            logging.exception("FAILURE: event %s %s", event_id, name)
if __name__ == '__main__':
    # Command-line entry point: load the schedule, then upload everything.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--schedule', default='schedule.json')
    arg_parser.add_argument('--videos', default='videos')
    opts = arg_parser.parse_args()

    videopath = os.path.abspath(opts.videos)
    with open(opts.schedule) as schedule_fh:
        schedule = json.load(schedule_fh)
    upload_all(videopath, schedule)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import json
import logging
"""
Takes schedule xml from frab conference system for EuroPython2014 and PyData Berlin 2014
and creates json schedule with metadata fields for internet archive
http://archive.org/help/abouts3.txt
http://blog.archive.org/2011/03/31/how-archive-org-items-are-structured/
via email from collections-service@archive.org
The mediatypes are:
texts
audio
movies
image
The Community collections are:
opensource (this is for texts)
opensource_audio
opensource_movies
opensource_media (for items that are not text, audio or movies in type)
"""
logging.basicConfig(
format='%(asctime)s|%(levelname)s|%(message)s',
datefmt='%Y-%m-%d %H:%M',
filename='frab2json.log',
level=logging.DEBUG,
)
from bs4 import BeautifulSoup
def get_common_metadata():
    """Metadata fields shared by every uploaded item.

    Every event is a movie destined for the community movie collection
    on archive.org.
    """
    common = {}
    common['mediatype'] = 'movies'
    common['collection'] = 'opensource_movies'
    common['type'] = 'conference'
    return common
def get_conference_metadata(event_id):
    """Metadata identifying which conference an event belongs to.

    Specific to europython2014 and pydataberlin2014: frab event ids below
    20000 are EuroPython 2014, everything else is PyData Berlin 2014.
    Both conferences happened in Berlin in 2014.
    """
    meta = {
        'year': '2014',
        'location': 'Berlin, Germany',
    }
    if int(event_id) < 20000:
        meta['is-part-of'] = 'https://ep2014.europython.eu/en/'
        meta['conference'] = 'EuroPython 2014'
        meta['subject'] = 'python; europython2014'
    else:
        meta['is-part-of'] = 'http://pydata.org/berlin2014/'
        meta['conference'] = 'PyData Berlin 2014'
        meta['subject'] = 'python; pydata; pydataberlin2014'
    return meta
def schedule_xml_to_dict(filename):
    """Parse a frab schedule XML file into archive.org item metadata.

    Args:
        filename: path to the frab schedule XML file.

    Returns:
        dict mapping event id -> metadata dict (frab fields merged with
        the per-conference and common archive.org fields). Events without
        a title are skipped, as are duplicate event ids.

    Raises:
        KeyError: if an <event> element has no 'id' attribute.
    """
    with open(filename, 'r') as fh:
        # NOTE(review): no explicit parser is given, so bs4 uses the best
        # one installed; tag-name matching can differ between parsers --
        # confirm against the environment this originally ran in.
        soup = BeautifulSoup(fh)
    events = soup.find_all('event')
    schedule = {}
    for e in events:
        event_id = e.attrs['id']  # blow up if there is no key
        if not e.title:
            logging.warning('skipping event %s with no title', event_id)
            continue
        if event_id in schedule:
            # BUG FIX: this previously tested `e in schedule`, comparing the
            # bs4 Tag object against the dict's string keys, so the
            # duplicate check could never trigger. Compare the id instead.
            logging.warning('skipping duplicate event %s', event_id)
            continue
        persons = e.find_all('person')
        event = {
            'extent': e.duration.text,
            'title': e.title.text,
            'date': e.date.text,
            'speakers': [p.text for p in persons],
            'schedule_event_type': e.type.text,
            'schedule_event_id': event_id,
        }
        # Optional fields: only include them when the XML provides them.
        if e.language:
            event['language'] = e.language.text
        if e.abstract:
            event['abstract'] = e.abstract.text
        if e.description:
            event['description'] = e.description.text
        event.update(get_conference_metadata(event_id))
        event.update(get_common_metadata())
        schedule[event_id] = event
    return schedule
if __name__ == '__main__':
    # Command-line entry point: XML in, JSON out.
    cli = argparse.ArgumentParser()
    cli.add_argument('--frabxml', '-f', default='schedule.xml')
    cli.add_argument('--output', '-o', default='schedule.json')
    options = cli.parse_args()

    schedule = schedule_xml_to_dict(options.frabxml)
    with open(options.output, 'w') as out_fh:
        json.dump(schedule, out_fh)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment