Initial recut of data import

This commit is contained in:
Matthew Ryan Dillon 2016-06-08 11:54:20 -07:00
parent 6162bd984f
commit 75829eb4a1
5 changed files with 69 additions and 302 deletions

69
ccdb/utils/data.py Normal file
View file

@ -0,0 +1,69 @@
import sqlite3
import os
import json
from django.conf import settings
import requests
import dateutil.parser as dp
def get_data_sources():
    """
    Make sure the legacy data files are present locally and open them.

    Reads the manifest URL from ``settings.MANIFEST_URL``. When that value
    is falsy nothing is downloaded and ``None`` is returned. Otherwise the
    ``data/`` directory is created if missing, the manifest and the files
    it lists are fetched, and a dict is returned whose ``'db0'`` entry is
    whatever ``_get_db0()`` yields.
    """
    url = settings.MANIFEST_URL
    if url:
        target_dir = 'data/'
        have_dir = os.path.exists(target_dir)
        if not have_dir:
            os.makedirs(target_dir)
        _fetch_data(target_dir, url)
        sources = {'db0': _get_db0()}
        return sources
    return None
def _fetch_data(data_dir, url):
    """
    Download the manifest at ``url`` and every file it lists.

    The manifest is stored in ``data_dir`` under the last path component
    of its URL and is only downloaded when no such file exists yet. It is
    then parsed as JSON, and each URL under its ``'files'`` key is
    downloaded into ``data_dir`` unless a file of that name is already
    present.
    """
    manifest_path = _filename(data_dir, url)
    if not os.path.exists(manifest_path):
        _write_url(url, manifest_path)
    with open(manifest_path) as handle:
        listing = json.load(handle)
    for file_url in listing['files']:
        local_path = _filename(data_dir, file_url)
        # Existence is checked right before each download, one file at a
        # time, so files fetched earlier in this loop are seen as present.
        if os.path.exists(local_path):
            continue
        _write_url(file_url, local_path)
def _filename(data_dir, url):
return ''.join([data_dir, url.split('/')[-1]])
def _write_url(url, filename):
    """
    Stream the resource at ``url`` into the local file ``filename``.

    Raises ``requests.HTTPError`` when the server answers with a 4xx/5xx
    status. Callers treat "file exists" as "file already downloaded", so
    an error page or a truncated download must never be left on disk.
    """
    import os

    r = requests.get(url, stream=True)
    try:
        # Check the status before opening the output file so that a failed
        # request does not create (or truncate) anything locally.
        r.raise_for_status()
        try:
            with open(filename, 'wb') as outfile:
                for chunk in r.iter_content(chunk_size=8192):
                    outfile.write(chunk)
        except Exception:
            # Drop the partial file; otherwise later runs would skip the
            # download because the path already exists.
            if os.path.exists(filename):
                os.remove(filename)
            raise
    finally:
        # Streamed responses keep their connection until closed.
        r.close()
def _get_db0():
    """
    Open the first legacy database.

    Returns the result of ``setup_sqlite`` for the fixed path below.
    """
    return setup_sqlite(
        'data/Replica_Hibernators_Back_UAF_Laptop_29_June_2015.sqlite')
def dtdt(s):
    """
    Turn a date/time value of unknown layout into a parsed date/time.

    The value is handed to dateutil's parser, which copes with the many
    formats the legacy data uses (MS Access exports in particular).
    """
    parsed = dp.parse(s)
    return parsed
# Register dtdt() as the sqlite3 converter for the type name "dtdt", so
# that values tagged with it (e.g. a column aliased as "name [dtdt]") are
# parsed on read by connections opened with type detection enabled.
sqlite3.register_converter("dtdt", dtdt)
def setup_sqlite(dbfile):
    """
    Open the SQLite database at ``dbfile`` and return a cursor on it.

    Returns ``None`` when no file exists at that path. The connection is
    opened with both declared-type and column-name type detection, and
    rows are produced as ``sqlite3.Row`` objects.
    """
    if not os.path.exists(dbfile):
        return None
    type_detection = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
    connection = sqlite3.connect(dbfile, detect_types=type_detection)
    connection.row_factory = sqlite3.Row
    return connection.cursor()

View file

@ -1,25 +0,0 @@
import sqlite3
import os
import dateutil.parser as dp
def dtdt(s):
    """
    Parse a date/time value whose format is not known in advance.

    Delegates to dateutil's parser so that the assorted date/time layouts
    found in the source data (notably those from MS Access) are accepted.
    """
    result = dp.parse(s)
    return result
# Make dtdt() the sqlite3 converter for the type name "dtdt"; queries can
# then request parsing by aliasing a column as "name [dtdt]" on
# connections opened with type detection enabled.
sqlite3.register_converter("dtdt", dtdt)
def setup_sqlite(dbfile='data/CC_Database_101314.sqlite'):
    """
    Open a SQLite database and return a cursor on it.

    ``dbfile`` is the path of the database file. It defaults to the
    legacy database path that used to be hard-coded here, so existing
    calls without arguments behave as before.

    Returns ``None`` when no file exists at ``dbfile``. Otherwise returns
    a cursor on a connection opened with declared-type and column-name
    type detection, producing ``sqlite3.Row`` rows.
    """
    if not os.path.exists(dbfile):
        return None
    db = sqlite3.connect(
        dbfile,
        detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
    db.row_factory = sqlite3.Row
    return db.cursor()

View file

@ -1,277 +0,0 @@
import os
from django.core.management.base import BaseCommand
from django.db import IntegrityError
import requests
from ccdb.utils.data_import import setup_sqlite
from ccdb.projects.models import Project, Grant, GrantReport
from ccdb.misc.models import MeasurementUnit, MeasurementType, Container, \
Material, Color
from ccdb.locations.models import Region, Site, MunicipalLocation, \
StudyLocation, StorageLocation
from ccdb.species.models import Species, CollectionSpecies
from ccdb.processing.models import ProcessType, Reagent, \
Flaw as ProcessingFlaw,Processing
from ccdb.collections_ccdb.models import CollectionType, CollectionMethod, \
Flaw as CollectionFlaw, ADFGPermit, Collection
from ccdb.experiments.models import Flaw as ExperimentFlaw, Experiment, \
ProtocolAttachment, TreatmentType, Treatment, TreatmentReplicate, \
AliveDeadCount
class Command(BaseCommand):
    """Management command that downloads the legacy data and imports it."""

    help = 'Imports prior data into the DB'

    def add_arguments(self, parser):
        """Register the required positional ``manifest_url`` argument."""
        parser.add_argument('manifest_url', type=str)

    def handle(self, *args, **options):
        """
        Fetch the files listed in the manifest, then import them.

        ``*args`` is accepted (and ignored) to match the
        ``BaseCommand.handle(*args, **options)`` signature, so the command
        does not fail when invoked with positional arguments.
        """
        _fetch_data(options['manifest_url'], self.stdout.write)
        self.stdout.write('Fetched data')
        _import_admin_data()
        self.stdout.write('Imported data')
def _fetch_data(url, write):
    """
    Download every file listed in the JSON manifest at ``url``.

    ``write`` is a callable taking one string; it is called with a
    progress message before each download. Files are stored in ``data/``
    under the last path component of their URL and are skipped when a
    file of that name already exists.

    Raises ``requests.HTTPError`` if the manifest or any file request
    returns a 4xx/5xx status.
    """
    data_dir = 'data/'
    manifest = requests.get(url)
    # Without this check an error response would be parsed as a manifest.
    manifest.raise_for_status()
    files = manifest.json()
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    for f in files['files']:
        p = ''.join([data_dir, f.split('/')[-1]])
        if os.path.exists(p):
            continue
        write('Grabbing {}'.format(p))
        download = requests.get(f, stream=True)
        try:
            # Check the status before creating the file: an existing file
            # is treated as already downloaded, so an error page saved
            # here would never be replaced.
            download.raise_for_status()
            try:
                with open(p, 'wb') as out_file:
                    for chunk in download.iter_content(chunk_size=8192):
                        out_file.write(chunk)
            except Exception:
                # Remove the truncated file so the next run retries it.
                if os.path.exists(p):
                    os.remove(p)
                raise
        finally:
            download.close()
def _import_admin_data():
    """
    Copy the legacy SQLite tables into the Django models.

    Does nothing when ``setup_sqlite()`` yields no cursor. Tables are
    processed in the fixed order below; the order is kept as-is because
    later sections look up or reference rows saved by earlier ones.
    Columns are read by position. Where a query appends
    ``AS "name [dtdt]"`` columns, the parsed values come after the ``*``
    columns, which is why those fields use the higher indices.

    ``IntegrityError`` raised while saving grant reports and collection
    species is deliberately ignored (those tables have no primary key in
    the source file).
    """
    c = setup_sqlite()
    if not c:
        # No legacy database available; nothing to import.
        return

    # Projects
    for r in c.execute('SELECT * FROM tbl_lu_projects;'):
        p = Project(id=r[0], name=r[1], code=r[2], iacuc_number=r[3],
                    description=r[4], sort_order=r[5])
        p.save()

    # Grants
    for r in c.execute('SELECT * FROM tbl_lu_grants;'):
        g = Grant(id=r[0], title=r[1], code=r[2],
                  description=r[3], sort_order=r[4])
        g.save()

    # Project-Grants
    for r in c.execute('SELECT * FROM tbl_hash_project_grants;'):
        p = Project.objects.get(id=r[0])
        g = Grant.objects.get(id=r[1])
        p.grants.add(g)
        p.save()

    # Grant Reports
    q = '''
        SELECT *, report_due_date AS "due_date [dtdt]"
        FROM tbl_lu_grant_reports;
    '''
    for r in c.execute(q):
        # No PK field in Andre's file
        gr = GrantReport(grant_id=r[0], title=r[1], report_type=r[2],
                         description=r[3], due_date=r[8],
                         submitted_date=r[5], attachment=r[6],
                         sort_order=r[7])
        try:
            gr.save()
        except IntegrityError:
            # Best effort: rows that violate a constraint are skipped.
            pass

    # Measurement Units
    for r in c.execute('SELECT * FROM tbl_lu_measurement_units;'):
        mu = MeasurementUnit(id=r[0], name=r[1], code=r[2],
                             unit_class=r[3], description=r[4],
                             sort_order=r[5])
        mu.save()

    # Measurement Types
    for r in c.execute('SELECT * FROM tbl_lu_measurement_types;'):
        mt = MeasurementType(id=r[0], name=r[1], code=r[2],
                             measurement_type_class=r[3], description=r[4],
                             default_measurement_unit_id=r[5],
                             sort_order=r[6])
        mt.save()

    # Materials
    for r in c.execute('SELECT * FROM tbl_lu_materials;'):
        m = Material(id=r[0], name=r[1], code=r[2], material_class=r[3],
                     description=r[4], sort_order=r[5])
        m.save()

    # Colors
    for r in c.execute('SELECT * FROM tbl_lu_colors;'):
        cl = Color(id=r[0], name=r[1], code=r[2],
                   color_number=r[3], sort_order=r[4])
        cl.save()

    # Containers
    for r in c.execute('SELECT * FROM tbl_lu_containers;'):
        cl = Container(id=r[0], name=r[1], code=r[2], application=r[3],
                       color_id=r[4], material_id=r[5], volume=r[6],
                       measurement_unit_id=r[7], sort_order=r[8])
        cl.save()

    # Regions
    for r in c.execute('SELECT * FROM tbl_lu_regions;'):
        re = Region(id=r[0], name=r[1], code=r[2], sort_order=r[3])
        re.save()

    # Site
    for r in c.execute('SELECT * FROM tbl_lu_sites;'):
        s = Site(region_id=r[0], id=r[1], name=r[2], code=r[3],
                 description=r[4], sort_order=r[5])
        s.save()

    # Municipal Locations
    for r in c.execute('SELECT * FROM tbl_lu_municipal_locations;'):
        ml = MunicipalLocation(id=r[1], name=r[2], code=r[3],
                               municipal_location_type=r[4],
                               description=r[5], sort_order=r[6])
        ml.save()

    # Study Locations
    for r in c.execute('SELECT * FROM tbl_lu_study_locations;'):
        sl = StudyLocation(site_id=r[0], id=r[1], name=r[2], code=r[3],
                           study_location_type=r[4], treatment_type=r[5],
                           municipal_location_id=r[6],
                           collecting_location=r[7],
                           description=r[13], sort_order=r[14])
        sl.save()

    # Storage Location
    for r in c.execute('SELECT * FROM tbl_lu_storage_locations;'):
        # The code is built from the building's initials, the temperature
        # and the freezer; the fallbacks below are used for the code only,
        # the model fields still receive the raw column values.
        bldg = "".join(e[0].upper() for e in r[2].split())
        temp_c = '20'
        if r[5]:
            temp_c = r[5]
        freezer = 'No Freezer'
        if r[4]:
            freezer = r[4]
        code = " ".join([bldg, str(temp_c)+'C', str(freezer)])
        sl = StorageLocation(id=r[0], facility=r[1], building=r[2],
                             room=r[3], freezer=r[4], temp_c=r[5],
                             code=code, description=r[6], sort_order=r[7])
        sl.save()

    # Species
    for r in c.execute('SELECT * FROM tbl_lu_species;'):
        s = Species(id=r[0], common_name=r[1], genus=r[2], species=r[3],
                    parasite=r[4], sort_order=r[5])
        s.save()

    # Processing Type
    for r in c.execute('SELECT * FROM tbl_lu_process_types;'):
        pt = ProcessType(id=r[0], name=r[1], code=r[2], description=r[3],
                         sort_order=r[4])
        pt.save()

    # Reagent
    for r in c.execute('SELECT * FROM tbl_lu_reagents;'):
        rg = Reagent(id=r[0], name=r[1], code=r[2], reagent_class=r[3],
                     sort_order=r[4])
        rg.save()

    # Collection Type
    for r in c.execute('SELECT * FROM tbl_lu_collection_types;'):
        ct = CollectionType(id=r[0], name=r[1], code=r[2], sort_order=r[3])
        ct.save()

    # Collection Method
    for r in c.execute('SELECT * FROM tbl_lu_collection_methods;'):
        cm = CollectionMethod(id=r[0], name=r[1], code=r[2],
                              collection_method_class=r[3],
                              sort_order=r[4])
        cm.save()

    # Collection
    for r in c.execute('''
            SELECT *,
            collection_start_date AS "collection_start_date [dtdt]",
            collection_start_time AS "collection_start_time [dtdt]",
            collection_end_date AS "collection_end_date [dtdt]",
            collection_end_time AS "collection_end_time [dtdt]"
            FROM tbl_collections;
            '''):
        # Compare by value: the previous identity test (`is not ''`) only
        # worked through string interning. A NULL permit name is treated
        # like an empty one, i.e. the collection has no permit.
        if r[14] not in (None, ''):
            permit, _ = ADFGPermit.objects.get_or_create(name=r[14])
        else:
            permit = None
        col = Collection(project_id=r[0], id=r[1], study_location_id=r[2],
                         collection_type_id=r[3],
                         collection_method_id=r[4],
                         number_of_traps=r[5],
                         collection_start_date=r[17],
                         collection_start_time=r[18],
                         collection_end_date=r[19],
                         collection_end_time=r[20],
                         storage_location_id=r[10],
                         specimen_state=r[11], process_type_id=r[12],
                         reagent_id=r[13], adfg_permit=permit)
        col.save()

    # Collection Species
    for r in c.execute('SELECT * FROM tbl_hash_collection_species;'):
        # No PK field in Andre's file
        cs = CollectionSpecies(collection_id=r[0], species_id=r[1],
                               sex=r[2], count=r[3], count_estimated=r[4])
        try:
            cs.save()
        except IntegrityError:
            # Best effort: rows that violate a constraint are skipped.
            pass

    # Experiment
    for r in c.execute('SELECT * FROM tbl_lu_experiments;'):
        e = Experiment(id=r[0], name=r[1], code=r[2],
                       description=r[3], sort_order=r[6])
        e.save()

    # Treatment Type
    for r in c.execute('SELECT * FROM tbl_lu_treatment_types;'):
        tt = TreatmentType(experiment_id=r[0], id=r[1], name=r[2],
                           code=r[3], treatment_type=r[4], placement=r[5],
                           description=r[6])
        tt.save()

    # Treatment
    for r in c.execute('SELECT * FROM tbl_treatments;'):
        t = Treatment(id=r[0], treatment_type_id=r[1], container_id=r[2],
                      study_location_id=r[3], species_id=r[4], sex=r[5])
        t.save()

    # Treatment Replicate
    for r in c.execute('''
            SELECT *, setup_date AS "setup_date [dtdt]"
            FROM tbl_treatment_replicates tr
            LEFT OUTER JOIN tbl_lu_record_flaws f ON f.flawid=tr.flawid;
            '''):
        if r[7]:
            flaw, _ = ExperimentFlaw.objects.get_or_create(name=r[10])
        else:
            flaw = None
        tr = TreatmentReplicate(treatment_id=r[0], id=r[1], name=r[2],
                                setup_date=r[13], setup_sample_size=r[5],
                                mass_g=r[6], flaw=flaw)
        tr.save()

    # Alive-Dead Count
    for r in c.execute('''
            SELECT *,
            status_date AS "status_date [dtdt]",
            status_time AS "status_time [dtdt]"
            FROM tbl_alive_dead_counts adc
            LEFT OUTER JOIN tbl_lu_record_flaws f ON f.flawid=adc.flawid;
            '''):
        if r[6]:
            flaw, _ = ExperimentFlaw.objects.get_or_create(name=r[9])
        else:
            flaw = None
        adc = AliveDeadCount(treatment_replicate_id=r[0], id=r[1],
                             status_date=r[12], status_time=r[13],
                             count_alive=r[4], count_dead=r[5], flaw=flaw)
        adc.save()

    # Experiment-Collection
    for r in c.execute('SELECT * FROM tbl_hash_collection_experiments;'):
        # Use a separate name here: rebinding `c` would shadow the cursor
        # that this loop is still iterating over.
        collection = Collection.objects.get(id=r[0])
        e = Experiment.objects.get(id=r[1])
        e.collections.add(collection)
        e.save()