From 75829eb4a1bf2874c92a6d547589dc52cc31b5e4 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 8 Jun 2016 11:54:20 -0700 Subject: [PATCH] Initial recut of data import --- ccdb/utils/data.py | 69 +++++ ccdb/utils/data_import.py | 25 -- ccdb/utils/management/__init__.py | 0 ccdb/utils/management/commands/__init__.py | 0 ccdb/utils/management/commands/import_data.py | 277 ------------------ 5 files changed, 69 insertions(+), 302 deletions(-) create mode 100644 ccdb/utils/data.py delete mode 100644 ccdb/utils/data_import.py delete mode 100644 ccdb/utils/management/__init__.py delete mode 100644 ccdb/utils/management/commands/__init__.py delete mode 100644 ccdb/utils/management/commands/import_data.py diff --git a/ccdb/utils/data.py b/ccdb/utils/data.py new file mode 100644 index 0000000..998f619 --- /dev/null +++ b/ccdb/utils/data.py @@ -0,0 +1,69 @@ +import sqlite3 +import os +import json + +from django.conf import settings + +import requests +import dateutil.parser as dp + + +def get_data_sources(): + manifest_url = settings.MANIFEST_URL + if not manifest_url: + return None + data_dir = 'data/' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + _fetch_data(data_dir, manifest_url) + return { + 'db0': _get_db0(), + } + + +def _fetch_data(data_dir, url): + manifest = _filename(data_dir, url) + if not os.path.exists(manifest): + _write_url(url, manifest) + with open(manifest) as data: + d = json.load(data) + for f in d['files']: + p = _filename(data_dir, f) + if not os.path.exists(p): + _write_url(f, p) + + +def _filename(data_dir, url): + return ''.join([data_dir, url.split('/')[-1]]) + + +def _write_url(url, filename): + r = requests.get(url, stream=True) + with open(filename, 'wb') as outfile: + for chunk in r: + outfile.write(chunk) + + +def _get_db0(): + dbfile = 'data/Replica_Hibernators_Back_UAF_Laptop_29_June_2015.sqlite' + return setup_sqlite(dbfile) + + +def dtdt(s): + """ + This lets us parse whatever crazy date/time formats that + come our way (looking at you, MS Access) + """ + return dp.parse(s) + + +sqlite3.register_converter("dtdt", dtdt) + + +def setup_sqlite(dbfile): + if os.path.exists(dbfile): + db = sqlite3.connect(dbfile, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES) + db.row_factory = sqlite3.Row + return db.cursor() + else: + return None diff --git a/ccdb/utils/data_import.py b/ccdb/utils/data_import.py deleted file mode 100644 index e0cb2d5..0000000 --- a/ccdb/utils/data_import.py +++ /dev/null @@ -1,25 +0,0 @@ -import sqlite3 -import os - -import dateutil.parser as dp - - -def dtdt(s): - """ - This lets us parse whatever crazy date/time formats that - come our way (looking at you, MS Access) - """ - return dp.parse(s) - - -sqlite3.register_converter("dtdt", dtdt) - - -def setup_sqlite(): - dbfile = 'data/CC_Database_101314.sqlite' - if os.path.exists(dbfile): - db = sqlite3.connect(dbfile, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) - db.row_factory = sqlite3.Row - return db.cursor() - else: - return None diff --git a/ccdb/utils/management/__init__.py b/ccdb/utils/management/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/ccdb/utils/management/commands/__init__.py b/ccdb/utils/management/commands/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/ccdb/utils/management/commands/import_data.py b/ccdb/utils/management/commands/import_data.py deleted file mode 100644 index 511dfcd..0000000 --- a/ccdb/utils/management/commands/import_data.py +++ /dev/null @@ -1,277 +0,0 @@ -import os - -from django.core.management.base import BaseCommand -from django.db import IntegrityError - -import requests - -from ccdb.utils.data_import import setup_sqlite -from ccdb.projects.models import Project, Grant, GrantReport -from ccdb.misc.models import MeasurementUnit, MeasurementType, Container, \ - Material, Color -from ccdb.locations.models import Region, Site, MunicipalLocation, \ - StudyLocation, StorageLocation -from ccdb.species.models import Species, CollectionSpecies -from ccdb.processing.models import ProcessType, Reagent, \ - Flaw as ProcessingFlaw,Processing -from ccdb.collections_ccdb.models import CollectionType, CollectionMethod, \ - Flaw as CollectionFlaw, ADFGPermit, Collection -from ccdb.experiments.models import Flaw as ExperimentFlaw, Experiment, \ - ProtocolAttachment, TreatmentType, Treatment, TreatmentReplicate, \ - AliveDeadCount - - -class Command(BaseCommand): - help = 'Imports prior data into the DB' - - def add_arguments(self, parser): - parser.add_argument('manifest_url', type=str) - - def handle(self, **options): - _fetch_data(options['manifest_url'], self.stdout.write) - self.stdout.write('Fetched data') - _import_admin_data() - self.stdout.write('Imported data') - - -def _fetch_data(url, write): - data_dir = 'data/' - r = requests.get(url) - files = r.json() - if not os.path.exists(data_dir): - os.makedirs(data_dir) - for f in files['files']: - p = ''.join([data_dir, f.split('/')[-1]]) - if not os.path.exists(p): - write('Grabbing {}'.format(p)) - r = requests.get(f, stream=True) - with open(p, 'wb') as out_file: - for chunk in r: - out_file.write(chunk) - - -def _import_admin_data(): - c = setup_sqlite() - if c: - # Projects - for r in c.execute('SELECT * FROM tbl_lu_projects;'): - p = Project(id=r[0], name=r[1], code=r[2], iacuc_number=r[3], - description=r[4], sort_order=r[5]) - p.save() - - # Grants - for r in c.execute('SELECT * FROM tbl_lu_grants;'): - g = Grant(id=r[0], title=r[1], code=r[2], - description=r[3], sort_order=r[4]) - g.save() - - # Project-Grants - for r in c.execute('SELECT * FROM tbl_hash_project_grants;'): - p = Project.objects.get(id=r[0]) - g = Grant.objects.get(id=r[1]) - p.grants.add(g) - p.save() - - # Grant Reports - q = ''' - SELECT *, report_due_date AS "due_date [dtdt]" - FROM tbl_lu_grant_reports; - ''' - for r in c.execute(q): - # No PK field in Andre's file - gr = GrantReport(grant_id=r[0], title=r[1], report_type=r[2], - description=r[3], due_date=r[8], submitted_date=r[5], - attachment=r[6], sort_order=r[7]) - try: - gr.save() - except IntegrityError: - pass - - # Measurement Units - for r in c.execute('SELECT * FROM tbl_lu_measurement_units;'): - mu = MeasurementUnit(id=r[0], name=r[1], code=r[2], - unit_class=r[3], description=r[4], sort_order=r[5]) - mu.save() - - # Measurement Types - for r in c.execute('SELECT * FROM tbl_lu_measurement_types;'): - mt = MeasurementType(id=r[0], name=r[1], code=r[2], - measurement_type_class=r[3], description=r[4], - default_measurement_unit_id=r[5], sort_order=r[6]) - mt.save() - - # Materials - for r in c.execute('SELECT * FROM tbl_lu_materials;'): - m = Material(id=r[0], name=r[1], code=r[2], material_class=r[3], - description=r[4], sort_order=r[5]) - m.save() - - # Colors - for r in c.execute('SELECT * FROM tbl_lu_colors;'): - cl = Color(id=r[0], name=r[1], code=r[2], - color_number=r[3], sort_order=r[4]) - cl.save() - - # Containers - for r in c.execute('SELECT * FROM tbl_lu_containers;'): - cl = Container(id=r[0], name=r[1], code=r[2], application=r[3], - color_id=r[4], material_id=r[5], volume=r[6], - measurement_unit_id=r[7], sort_order=r[8]) - cl.save() - - # Regions - for r in c.execute('SELECT * FROM tbl_lu_regions;'): - re = Region(id=r[0], name=r[1], code=r[2], sort_order=r[3]) - re.save() - - # Site - for r in c.execute('SELECT * FROM tbl_lu_sites;'): - s = Site(region_id=r[0], id=r[1], name=r[2], code=r[3], - description=r[4], sort_order=r[5]) - s.save() - - # Municipal Locations - for r in c.execute('SELECT * FROM tbl_lu_municipal_locations;'): - ml = MunicipalLocation(id=r[1], name=r[2], code=r[3], - municipal_location_type=r[4], description=r[5], sort_order=r[6]) - ml.save() - - # Study Locations - for r in c.execute('SELECT * FROM tbl_lu_study_locations;'): - sl = StudyLocation(site_id=r[0], id=r[1], name=r[2], code=r[3], - study_location_type=r[4], treatment_type=r[5], - municipal_location_id=r[6], collecting_location=r[7], - description=r[13], sort_order=r[14]) - sl.save() - - # Storage Location - for r in c.execute('SELECT * FROM tbl_lu_storage_locations;'): - bldg = "".join(e[0].upper() for e in r[2].split()) - temp_c = '20' - if r[5]: - temp_c = r[5] - freezer = 'No Freezer' - if r[4]: - freezer = r[4] - code = " ".join([bldg, str(temp_c)+'C', str(freezer)]) - sl = StorageLocation(id=r[0], facility=r[1], building=r[2], - room=r[3], freezer=r[4], temp_c=r[5], code=code, - description=r[6], sort_order=r[7]) - sl.save() - - # Species - for r in c.execute('SELECT * FROM tbl_lu_species;'): - s = Species(id=r[0], common_name=r[1], genus=r[2], species=r[3], - parasite=r[4], sort_order=r[5]) - s.save() - - # Processing Type - for r in c.execute('SELECT * FROM tbl_lu_process_types;'): - pt = ProcessType(id=r[0], name=r[1], code=r[2], description=r[3], - sort_order=r[4]) - pt.save() - - # Reagent - for r in c.execute('SELECT * FROM tbl_lu_reagents;'): - rg = Reagent(id=r[0], name=r[1], code=r[2], reagent_class=r[3], - sort_order=r[4]) - rg.save() - - # Collection Type - for r in c.execute('SELECT * FROM tbl_lu_collection_types;'): - ct = CollectionType(id=r[0], name=r[1], code=r[2], sort_order=r[3]) - ct.save() - - # Collection Method - for r in c.execute('SELECT * FROM tbl_lu_collection_methods;'): - cm = CollectionMethod(id=r[0], name=r[1], code=r[2], - collection_method_class=r[3], sort_order=r[4]) - cm.save() - - # Collection - for r in c.execute(''' - SELECT *, - collection_start_date AS "collection_start_date [dtdt]", - collection_start_time AS "collection_start_time [dtdt]", - collection_end_date AS "collection_end_date [dtdt]", - collection_end_time AS "collection_end_time [dtdt]" - FROM tbl_collections; - '''): - if r[14] is not '': - permit, _ = ADFGPermit.objects.get_or_create(name=r[14]) - else: - permit = None - col = Collection(project_id=r[0], id=r[1], study_location_id=r[2], - collection_type_id=r[3], collection_method_id=r[4], - number_of_traps=r[5], collection_start_date=r[17], - collection_start_time=r[18], collection_end_date=r[19], - collection_end_time=r[20], storage_location_id=r[10], - specimen_state=r[11], process_type_id=r[12], reagent_id=r[13], - adfg_permit=permit) - col.save() - - # Collection Species - for r in c.execute('SELECT * FROM tbl_hash_collection_species;'): - # No PK field in Andre's file - cs = CollectionSpecies(collection_id=r[0], species_id=r[1], - sex=r[2], count=r[3], count_estimated=r[4]) - try: - cs.save() - except IntegrityError: - pass - - # Experiment - for r in c.execute('SELECT * FROM tbl_lu_experiments;'): - e = Experiment(id=r[0], name=r[1], code=r[2], - description=r[3], sort_order=r[6]) - e.save() - - # Treatment Type - for r in c.execute('SELECT * FROM tbl_lu_treatment_types;'): - tt = TreatmentType(experiment_id=r[0], id=r[1], name=r[2], code=r[3], - treatment_type=r[4], placement=r[5], description=r[6]) - tt.save() - - # Treatment - for r in c.execute('SELECT * FROM tbl_treatments;'): - t = Treatment(id=r[0], treatment_type_id=r[1], container_id=r[2], - study_location_id=r[3], species_id=r[4], sex=r[5]) - t.save() - - # Treatment Replicate - for r in c.execute(''' - SELECT *, setup_date AS "setup_date [dtdt]" - FROM tbl_treatment_replicates tr - LEFT OUTER JOIN tbl_lu_record_flaws f ON f.flawid=tr.flawid; - '''): - if r[7]: - flaw, _ = ExperimentFlaw.objects.get_or_create(name=r[10]) - else: - flaw = None - tr = TreatmentReplicate(treatment_id=r[0], id=r[1], name=r[2], - setup_date=r[13], setup_sample_size=r[5], mass_g=r[6], flaw=flaw) - tr.save() - - # Alive-Dead Count - for r in c.execute(''' - SELECT *, - status_date AS "status_date [dtdt]", - status_time AS "status_time [dtdt]" - FROM tbl_alive_dead_counts adc - LEFT OUTER JOIN tbl_lu_record_flaws f ON f.flawid=adc.flawid; - '''): - if r[6]: - flaw, _ = ExperimentFlaw.objects.get_or_create(name=r[9]) - else: - flaw = None - adc = AliveDeadCount(treatment_replicate_id=r[0], id=r[1], - status_date=r[12], status_time=r[13], count_alive=r[4], - count_dead=r[5], flaw=flaw) - adc.save() - - # Experiment-Collection - for r in c.execute('SELECT * FROM tbl_hash_collection_experiments;'): - c = Collection.objects.get(id=r[0]) - e = Experiment.objects.get(id=r[1]) - e.collections.add(c) - e.save()