Example queries

--****************************************************************
--****************************************************************
--*** Explore your dataset
--****************************************************************
--****************************************************************

-- Determine the number of patients in the project.
select count(*) from FellowsSample.S00.PATIENT_DIMENSION;

-- View the data on the first 100 patients.
select top 100 * from FellowsSample.S00.PATIENT_DIMENSION;

-- View the first 100 observations about the patients.
select top 100 * from FellowsSample.S00.OBSERVATION_FACT;

-- Get a list of concepts (codes) and the number of patients with that code.
-- Save the results to a temp table. (Temp tables start with a "#".)
-- Note that this query will take several minutes to several hours depending
-- on the project and the number of other people running queries.
select concept_cd, count(distinct patient_num) num_patients, count(*) num_facts
	into #concept_count
	from FellowsSample.S00.OBSERVATION_FACT
	group by concept_cd;

-- Join the list of concepts with the I2B2 view to get their names.
-- Save the results to a table in your project database (not a temp table).
select c.concept_cd, c.num_patients, c.num_facts, max(i.c_name) c_name
	into S00.dbo.MyProjectConceptList
	from #concept_count c
		inner join FellowsSample.common.I2B2 i
			on c.concept_cd = i.c_basecode
	group by c.concept_cd, c.num_patients, c.num_facts;

-- Drop the temp table since we no longer need it.
drop table #concept_count;

-- View the top 100 concepts by number of patients.
select top 100 *
	from S00.dbo.MyProjectConceptList
	order by num_patients desc;


--****************************************************************
--****************************************************************
--*** Explore the ontology
--****************************************************************
--****************************************************************

-- View the top level "folders" in the ontology.
select top 100 *
	from FellowsSample.common.i2b2
	where c_hlevel<=1
	order by c_fullname;

-- View the child concepts under Demographics.
select top 100 *
	from FellowsSample.common.i2b2
	where c_fullname like '\i2b2\Demographics\%'
		and c_hlevel=2
	order by c_fullname;

-- View the concept codes (c_basecode) for race.
select top 100 *
	from FellowsSample.common.i2b2
	where c_fullname like '\i2b2\Demographics\Race\%'
	order by c_fullname;

-- Search for a concept by name.
select top 100 *
	from FellowsSample.common.i2b2
	where c_name like '%diabetes mellitus%'
	order by c_fullname;

-- Use the concept dimension to get all codes under "E08-E13 Diabetes Mellitus"
select top 1000 *
	from FellowsSample.common.CONCEPT_DIMENSION
	where concept_path like '\i2b2\DiagnosisICD10V2_2018AAA20098492\A18921516\A18916268\%';


--****************************************************************
--****************************************************************
--*** Create a pivot table
--****************************************************************
--****************************************************************

-- Create a "pivot" with one row per patient and features in columns.
select patient_num
		, max(case when concept_cd='ICD10CM:I10' then 1
			else 0 end) HasHypertension
		, max(case when concept_cd='ICD10CM:E11.9' then 1 
			else 0 end) HasType2Diabetes
		, avg(case when concept_cd='LOINC:4548-4' then nval_num 
			else null end) MeanHbA1c
	into #pivot
	from FellowsSample.S00.OBSERVATION_FACT
	where start_date >= '1/1/2018' and start_date < '1/1/2020'
		and concept_cd in ('ICD10CM:I10','ICD10CM:E11.9','LOINC:4548-4')
	group by patient_num;
    
-- Note that the above query is missing patients who do not have any of
-- the listed concepts within the given date range. We can add them here.
insert into #pivot
	select patient_num, 0, 0, null
	from FellowsSample.S00.PATIENT_DIMENSION
	where patient_num not in (select patient_num from #pivot);

-- View the first 100 rows of the pivot to make sure it looks correct.
select top 100 * from #pivot;

-- Save the pivot to your project database.
select *
	into S00.dbo.MyPivot
	from #pivot;

-- Add a primary key to the new table.
alter table S00.dbo.MyPivot add primary key (patient_num);

-- Drop the temp table since we no longer need it.
drop table #pivot;


--****************************************************************
--****************************************************************
--*** Sample and partition patients
--****************************************************************
--****************************************************************

-- Select a random 1000 patients.
-- The function newid() creates a pseudorandom value.
select top 1000 patient_num
	into #PatientSample
	from FellowsSample.S00.PATIENT_DIMENSION
	order by newid();

-- Create a primary key to speed up queries.
alter table #PatientSample add primary key (patient_num);

-- Determine the total number of visits for the sample.
select count(*)
	from FellowsSample.S00.VISIT_DIMENSION
	where patient_num in (select patient_num from #PatientSample);

-- Partition the sample into 5 equal partitions.
-- Assign patients in partition 5 (20%) to the test cohort.
-- Assign the remaining 80% of patients to the training cohort.
-- The function ntile(N) splits data into N partitions 1, 2, ..., N.
select patient_num, 
		(case when k=5 then 'Test' else 'Train' end) cohort
	into #PatientCohort
	from (
		select patient_num, ntile(5) over (order by newid()) k
		from #PatientSample
	) t;

-- Create a table with the sex breakdown for each cohort.
select c.cohort
		, count(*) Total
		, sum(case when p.sex_cd='DEM|SEX:F' then 1 else 0 end) Female
		, sum(case when p.sex_cd='DEM|SEX:M' then 1 else 0 end) Male
	from FellowsSample.S00.PATIENT_DIMENSION p
		inner join #PatientCohort c
			on p.patient_num = c.patient_num
	group by c.cohort
	order by c.cohort;

-- Randomly assign a sequential "StudyID" number to each patient.
select patient_num, row_number() over (order by newid()) StudyID
	into #PatientStudyID
	from #PatientSample;

-- Drop the temp tables.
drop table #PatientSample;
drop table #PatientCohort;
drop table #PatientStudyID;


--****************************************************************
--****************************************************************
--*** Create a synonym
--****************************************************************
--****************************************************************

-- WARNING: This makes it easier to write queries.
-- However, it also makes it easier to make mistakes.

-- Make sure you are in your project database.
use S00;

-- Create a synonym in your project database.
create synonym patients
	for FellowsSample.S00.PATIENT_DIMENSION;

-- Use the synonym to query data more easily.
select count(*) from patients;

-- Drop the synonym if no longer needed.
drop synonym patients;


--****************************************************************
--****************************************************************
--*** Link to ZCTA (ZIP Code Tabulation Areas) data
--****************************************************************
--****************************************************************

-- View the data on the first 100 records.
select top 100 * from FellowsSample.S00.zcta_cvs;

-- Get a list of all indicators in this table.
select domain, indicator, indicator_description
	from FellowsSample.S00.zcta_cvs
	group by domain, indicator, indicator_description;

-- Select 100 patients who live in zip codes with unemployment
-- greater than 10% from the American Community Survey.
select top 100 z.fact ACS_Unemployment, p.*
	from FellowsSample.S00.PATIENT_DIMENSION p
		inner join FellowsSample.dbo.zcta_cvs z
			on left(p.zip_cd,5) = z.ZCTA
	where z.indicator='ACS_Unemployment' 
		and cast(z.fact as float) > 10.0;

Last updated