Example Queries
The following example queries use the database FellowsSample and the project ID S000.
--****************************************************************
--****************************************************************
--*** Explore your dataset
--****************************************************************
--****************************************************************
-- Determine the number of patients in the project.
select count(*) from FellowsSample.S000.PATIENT_DIMENSION;
-- View the data on the first 100 patients.
select top 100 * from FellowsSample.S000.PATIENT_DIMENSION;
-- View the first 100 observations about the patients.
select top 100 * from FellowsSample.S000.OBSERVATION_FACT;
-- Get a list of concepts (codes) and the number of patients with that code.
-- Save the results to a temp table. (Temp tables start with a "#".)
-- Note that this query will take several minutes to several hours depending
-- on the project and the number of other people running queries.
select concept_cd, count(distinct patient_num) num_patients, count(*) num_facts
into #concept_count
from FellowsSample.S000.OBSERVATION_FACT
group by concept_cd;
-- Join the list of concepts with the I2B2 view to get their names.
-- Save the results to a table in your project database (not a temp table).
select c.concept_cd, c.num_patients, c.num_facts, max(i.c_name) c_name
into S000.dbo.MyProjectConceptList
from #concept_count c
inner join FellowsSample.common.I2B2 i
on c.concept_cd = i.c_basecode
group by c.concept_cd, c.num_patients, c.num_facts;
-- Drop the temp table since we no longer need it.
drop table #concept_count;
-- View the top 100 concepts by number of patients.
select top 100 *
from S000.dbo.MyProjectConceptList
order by num_patients desc;
--****************************************************************
--****************************************************************
--*** Explore the ontology
--****************************************************************
--****************************************************************
-- View the top level "folders" in the ontology.
select top 100 *
from FellowsSample.common.i2b2
where c_hlevel<=1
order by c_fullname;
-- View the child concepts under Demographics.
select top 100 *
from FellowsSample.common.i2b2
where c_fullname like '\i2b2\Demographics\%'
and c_hlevel=2
order by c_fullname;
-- View the concept codes (c_basecode) for race.
select top 100 *
from FellowsSample.common.i2b2
where c_fullname like '\i2b2\Demographics\Race\%'
order by c_fullname;
-- Search for a concept by name.
select top 100 *
from FellowsSample.common.i2b2
where c_name like '%diabetes mellitus%'
order by c_fullname;
-- Use the concept dimension to get all codes under "E08-E13 Diabetes Mellitus"
select top 1000 *
from FellowsSample.common.CONCEPT_DIMENSION
where concept_path like '\i2b2\DiagnosisICD10V2_2018AAA20098492\A18921516\A18916268\%';
--****************************************************************
--****************************************************************
--*** Create a pivot table
--****************************************************************
--****************************************************************
-- Create a "pivot" with one row per patient and features in columns.
select patient_num
, max(case when concept_cd='ICD10CM:I10' then 1
else 0 end) HasHypertension
, max(case when concept_cd='ICD10CM:E11.9' then 1
else 0 end) HasType2Diabetes
, avg(case when concept_cd='LOINC:4548-4' then nval_num
else null end) MeanHbA1c
into #pivot
from FellowsSample.S000.OBSERVATION_FACT
where start_date >= '1/1/2018' and start_date < '1/1/2020'
and concept_cd in ('ICD10CM:I10','ICD10CM:E11.9','LOINC:4548-4')
group by patient_num;
-- Note that the above query is missing patients who do not have any of
-- the listed concepts within the given date range. We can add them here.
insert into #pivot
select patient_num, 0, 0, null
from FellowsSample.S000.PATIENT_DIMENSION
where patient_num not in (select patient_num from #pivot);
-- View the first 100 rows of the pivot to make sure it looks correct.
select top 100 * from #pivot;
-- Save the pivot to your project database.
select *
into S000.dbo.MyPivot
from #pivot;
-- Add a primary key to the new table.
alter table S000.dbo.MyPivot add primary key (patient_num);
-- Drop the temp table since we no longer need it.
drop table #pivot;
--****************************************************************
--****************************************************************
--*** Sample and partition patients
--****************************************************************
--****************************************************************
-- Select a random 1000 patients.
-- The function newid() creates a pseudorandom value.
select top 1000 patient_num
into #PatientSample
from FellowsSample.S000.PATIENT_DIMENSION
order by newid();
-- Create a primary key to speed up queries.
alter table #PatientSample add primary key (patient_num);
-- Determine the total number of visits for the sample.
select count(*)
from FellowsSample.S000.VISIT_DIMENSION
where patient_num in (select patient_num from #PatientSample);
-- Partition the sample into 5 equal partitions.
-- Assign patients in partition 5 (20%) to the test cohort.
-- Assign the remaining 80% of patients to the training cohort.
-- The function ntile(N) splits data into N partitions 1, 2, ..., N.
select patient_num,
(case when k=5 then 'Test' else 'Train' end) cohort
into #PatientCohort
from (
select patient_num, ntile(5) over (order by newid()) k
from #PatientSample
) t;
-- Create a table with the sex breakdown for each cohort.
select c.cohort
, count(*) Total
, sum(case when p.sex_cd='DEM|SEX:F' then 1 else 0 end) Female
, sum(case when p.sex_cd='DEM|SEX:M' then 1 else 0 end) Male
from FellowsSample.S000.PATIENT_DIMENSION p
inner join #PatientCohort c
on p.patient_num = c.patient_num
group by c.cohort
order by c.cohort;
-- Randomly assign a sequential "StudyID" number to each patient.
select patient_num, row_number() over (order by newid()) StudyID
into #PatientStudyID
from #PatientSample;
-- Drop the temp tables.
drop table #PatientSample;
drop table #PatientCohort;
drop table #PatientStudyID;
--****************************************************************
--****************************************************************
--*** Create a synonym
--****************************************************************
--****************************************************************
-- WARNING: This makes it easier to write queries.
-- However, it also makes it easier to make mistakes.
-- Make sure you are in your project database.
use S000;
-- Create a synonym in your project database.
create synonym patients
for FellowsSample.S000.PATIENT_DIMENSION;
-- Use the synonym to query data more easily.
select count(*) from patients;
-- Drop the synonym if no longer needed.
drop synonym patients;
--****************************************************************
--****************************************************************
--*** Link to ZCTA (ZIP Code Tabulation Areas) data
--****************************************************************
--****************************************************************
-- View the data on the first 100 records.
select top 100 * from FellowsSample.S000.zcta_cvs;
-- Get a list of all indicators in this table.
select domain, indicator, indicator_description
from FellowsSample.S000.zcta_cvs
group by domain, indicator, indicator_description;
-- Select 100 patients who live in zip codes with unemployment
-- greater than 10% from the American Community Survey.
select top 100 z.fact ACS_Unemployment, p.*
from FellowsSample.S000.PATIENT_DIMENSION p
inner join FellowsSample.dbo.zcta_cvs z
on left(p.zip_cd,5) = z.ZCTA
where z.indicator='ACS_Unemployment'
and cast(z.fact as float) > 10.0;
Last updated