Greeting stackoverflow R community;
I have downloaded 10 data sets from CDC that correspond to mortality from 2003 to 2013. Each year is a separate data set. Some data sets have an extension of .DUSMCPUB and others have an extension of .dat. I have found a python script on github to parse 2013 .DUSMCPUB file and export it to .csv. However, I have not found anything to read the .dat files. I'm playing around and changing parameters with the data <- read.table(file = 'Mort03us.dat', header =TRUE, sep = ','). When I open the .dat file in a text editor I get this outcome.
I'm expecting headers on this data. However, I did read the documentation to see if the position of the values make any sense with the documentation, and doesn't.
I search the CDC website hoping to find documentation on how to read the file, or from what software this file was exported, or some information that might help to read the file, but no documentation I found.
Here I found a similar question, however, my .dat is not text and the script is not working for me. Also, I played around with the script without success.
Anyone of you have worked with CDC data sets in .dat format that can give me some guidance on how to import this data into RStudio?
July 1st, 2016:
Solution to this question:
I want to make an update to my question. I found a solution and I want to share with the community. Finally, I tweak a python script from github. I was able to parse the .dat file and import the data set into R. I want to thank you all!!
fileObj = open('Mort03us.dat', 'r')
fileOutObj = open('mort_2003.csv', 'a')
fileOutObj.write('Resident_Status, Education, Month_Of_Death, Sex, Age_Key, Age_Value, Age_Sub_Flag, Age_Recode_52, ' +
'Age_Recode_27, Age_Recode_12, Infant_Age_Recode_22, Place_Of_Death, Marital_Status, DOW_of_Death, ' +
'Data_Year, Injured_At_Work, Manner_Of_Death, Method_Of_Disposition, Autopsy, Activity_Code, ' +
'Place_Of_Causal_Injury, ICD10, Cause_Recode_358, Cause_Recode_113, Infant_Cause_Recode_130, ' +
'Cause_Recode_39, Entity_Axis_Conditions, EAC1, EAC2, EAC3, EAC4, EAC5, EAC6, EAC7, EAC8, EAC9, ' +
'EAC10, EAC11, EAC12, EAC13, EAC14, EAC15, EAC16, EAC17, EAC18, EAC19, EAC20, '
'Record_Axis_Conditions, RA1, RA2, RA3, RA4, RA5, RA6, RA7, RA8, RA9, RA10, RA11, RA12, RA13, RA14, ' +
'RA15, RA16, RA17, RA18, RA19, RA20, Race, Race_Bridged, Race_Imputation, Race_Recode_3, ' +
'Race_Recode_5, Hispanic_Origin, Hispanic_Origin_Recode\n')
outStr = ''
for line in fileObj:
Resident_Status = line[19].strip()
Education = line[60:62].strip()
Month_Of_Death = line[64:66].strip()
Sex = line[68].strip()
Age_Key = line[69].strip()
Age_Value = line[70:73].strip()
Age_Sub_Flag = line[73].strip()
Age_Recode_52 = line[74:76].strip()
Age_Recode_27 = line[76:78].strip()
Age_Recode_12 = line[78:80].strip()
Infant_Age_Recode_22 = line[80:82].strip()
Place_Of_Death = line[82].strip()
Marital_Status = line[83].strip()
DOW_of_Death = line[84].strip()
Data_Year = line[101:105].strip()
Injured_At_Work = line[105].strip()
Manner_Of_Death = line[106].strip()
Method_Of_Disposition = line[107].strip()
Autopsy = line[108].strip()
Activity_Code = line[143].strip()
Place_Of_Causal_Injury = line[144].strip()
ICD10 = line[145:149].strip()
Cause_Recode_358 = line[149:152].strip()
Cause_Recode_113 = line[153:156].strip()
Infant_Cause_Recode_130 = line[156:159].strip()
Cause_Recode_39 = line[159:161].strip()
Entity_Axis_Conditions = line[162:164].strip()
EAC1 = line[164:171].strip()
EAC2 = line[171:178].strip()
EAC3 = line[178:185].strip()
EAC4 = line[185:192].strip()
EAC5 = line[192:199].strip()
EAC6 = line[199:206].strip()
EAC7 = line[206:213].strip()
EAC8 = line[213:220].strip()
EAC9 = line[220:227].strip()
EAC10 = line[227:234].strip()
EAC11 = line[234:241].strip()
EAC12 = line[241:248].strip()
EAC13 = line[248:255].strip()
EAC14 = line[255:262].strip()
EAC15 = line[262:269].strip()
EAC16 = line[269:276].strip()
EAC17 = line[276:283].strip()
EAC18 = line[283:290].strip()
EAC19 = line[290:297].strip()
EAC20 = line[297:304].strip()
Record_Axis_Conditions = line[340:342]
RA1 = line[343:348].strip()
RA2 = line[348:353].strip()
RA3 = line[353:358].strip()
RA4 = line[358:363].strip()
RA5 = line[363:368].strip()
RA6 = line[368:373].strip()
RA7 = line[373:378].strip()
RA8 = line[378:383].strip()
RA9 = line[383:388].strip()
RA10 = line[388:393].strip()
RA11 = line[393:398].strip()
RA12 = line[398:403].strip()
RA13 = line[403:408].strip()
RA14 = line[408:413].strip()
RA15 = line[413:418].strip()
RA16 = line[418:423].strip()
RA17 = line[423:428].strip()
RA18 = line[428:433].strip()
RA19 = line[433:438].strip()
RA20 = line[438:443].strip()
Race = line[444:446].strip()
Race_Bridged = line[446].strip()
Race_Imputation = line[447].strip()
Race_Recode_3 = line[448].strip()
Race_Recode_5 = line[449].strip()
Hispanic_Origin = line[483:486].strip()
Hispanic_Origin_Recode = line[487].strip()
outStr = (Resident_Status + ', ' + Education + ', ' + Month_Of_Death + ', ' + Sex +
', ' + Age_Key + ', ' + Age_Value + ', ' + Age_Sub_Flag + ', ' + Age_Recode_52 +
', ' + Age_Recode_27 + ', ' + Age_Recode_12 + ', ' + Infant_Age_Recode_22 + ', ' + Place_Of_Death +
', ' + Marital_Status + ', ' + DOW_of_Death + ', ' + Data_Year + ', ' + Injured_At_Work +
', ' + Manner_Of_Death + ', ' + Method_Of_Disposition + ', ' + Autopsy + ', ' + Activity_Code +
', ' + Place_Of_Causal_Injury + ', ' + ICD10 + ', ' + Cause_Recode_358 + ', ' + Cause_Recode_113 +
', ' + Infant_Cause_Recode_130 + ', ' + Cause_Recode_39 + ', ' + Entity_Axis_Conditions + ', ' + EAC1 +
', ' + EAC2 + ', ' + EAC3 + ', ' + EAC4 + ', ' + EAC5 +
', ' + EAC6 + ', ' + EAC7 + ', ' + EAC8 + ', ' + EAC9 +
', ' + EAC10 + ', ' + EAC11 + ', ' + EAC12 + ', ' + EAC13 +
', ' + EAC14 + ', ' + EAC15 + ', ' + EAC16 + ', ' + EAC17 +
', ' + EAC18 + ', ' + EAC19 + ', ' + EAC20 + ', ' + Record_Axis_Conditions +
', ' + RA1 + ', ' + RA2 + ', ' + RA3 + ', ' + RA4 +
', ' + RA5 + ', ' + RA6 + ', ' + RA7 + ', ' + RA8 +
', ' + RA9 + ', ' + RA10 + ', ' + RA11 + ', ' + RA12 +
', ' + RA13 + ', ' + RA14 + ', ' + RA15 + ', ' + RA16 +
', ' + RA17 + ', ' + RA18 + ', ' + RA19 + ', ' + RA20 +
', ' + Race + ', ' + Race_Bridged + ', ' + Race_Imputation + ', ' + Race_Recode_3 +
', ' + Race_Recode_5 + ', ' + Hispanic_Origin + ', ' + Hispanic_Origin_Recode + '\n')
fileOutObj.write(outStr)
print("Parse complete.")
fileOutObj.close()
fileObj.close()
This code will work with Python 3.
Regardless of the file extension, it's merely a text file (88MB, with 2.4M rows). On inspection, it looks fixed-width (at 430 chars wide, a bit wide to show here, though your picture does it justice).
The only "trick" to reading in FWF files is to find the column widths. In some "dat" files, this is well-defined/documented, but it can often be inferred by looking at the first few lines. In this case, I just counted characters. (I only grab the first 100 lines for brevity.)
Some of the fields appear to be space-delimited. That is, some (even most) of the rows had a space between two groupings but some rows had none. Since I don't know how to decode the logic, I keep those together (with the space). (Namely: V6, V8, and V9.)
mort <- read.fwf("~/Downloads/foo/Mort03us.dat",
widths = c(30, 26, 6, 4, 7, 12, 24, 43, 9, 178, 3, 100, 4, 4, 36, 2),
header = FALSE, n = 100, stringsAsFactors = FALSE)
str(mort)
# 'data.frame': 100 obs. of 16 variables:
# $ V1 : chr " 11AK999AK9AK" " 11AK999AK9AK" " 11AK999AK9AK" " 11AK999AK9AK" ...
# $ V2 : chr " AK9999999992AK00009900OR" " AK9999999992AK00009900NY" " AK9999999992AK00009900WA" " AK9999999992AK00009900TX" ...
# $ V3 : chr " OR14" " NY17" " WA15" " TX16" ...
# $ V4 : int 1 1 1 1 1 1 1 1 1 1 ...
# $ V5 : chr " F1079" " F1088" " F1059" " F1084" ...
# $ V6 : chr " 412110 4W4" " 432311 1W6" " 371708 6D7" " 422210 4W7" ...
# $ V7 : chr " 2003U7UN" " 2003U7UN" " 2003U7UN" " 2003U7UN" ...
# $ V8 : chr " I361226" " I499228" " C64 117" " C349093" ...
# $ V9 : chr " 068 22" " 068 22" " 034 12" " 027 08" ...
# $ V10: chr " 0711I500 21I361 31I10 61I679 62I739 63M199 64N289 "| __truncated__ " 0311R568 21I959 31I499 "| __truncated__ " 0111C64 "| __truncated__ " 0111C349 "| __truncated__ ...
# $ V11: int 7 3 1 1 2 3 4 4 1 1 ...
# $ V12: chr " I10 I361 I500 I679 I739 M199 N289 " " I499 I959 R568 " " C64 " " C349 " ...
# $ V13: int 1 1 1 3 3 1 1 1 1 1 ...
# $ V14: int 11 11 11 23 23 11 11 11 11 11 ...
# $ V15: int 100 100 100 100 100 100 100 100 100 100 ...
# $ V16: int 6 6 6 8 8 6 6 6 6 6 ...
There is a lot of white-space, and the read.fwf function doesn't automatically remove it for you, so we'll have to take care of that next.
# chars <- sapply(mort, is.character)
mort[,chars] <- sapply(mort[,chars], function(z) {
gsub("[[:space:]]+", " ", trimws(z))
}, simplify = FALSE)
str(mort)
# 'data.frame': 100 obs. of 16 variables:
# $ V1 : chr "11AK999AK9AK" "11AK999AK9AK" "11AK999AK9AK" "11AK999AK9AK" ...
# $ V2 : chr "AK9999999992AK00009900OR" "AK9999999992AK00009900NY" "AK9999999992AK00009900WA" "AK9999999992AK00009900TX" ...
# $ V3 : chr "OR14" "NY17" "WA15" "TX16" ...
# $ V4 : int 1 1 1 1 1 1 1 1 1 1 ...
# $ V5 : chr "F1079" "F1088" "F1059" "F1084" ...
# $ V6 : chr "412110 4W4" "432311 1W6" "371708 6D7" "422210 4W7" ...
# $ V7 : chr "2003U7UN" "2003U7UN" "2003U7UN" "2003U7UN" ...
# $ V8 : chr "I361226" "I499228" "C64 117" "C349093" ...
# $ V9 : chr "068 22" "068 22" "034 12" "027 08" ...
# $ V10: chr "0711I500 21I361 31I10 61I679 62I739 63M199 64N289" "0311R568 21I959 31I499" "0111C64" "0111C349" ...
# $ V11: int 7 3 1 1 2 3 4 4 1 1 ...
# $ V12: chr "I10 I361 I500 I679 I739 M199 N289" "I499 I959 R568" "C64" "C349" ...
# $ V13: int 1 1 1 3 3 1 1 1 1 1 ...
# $ V14: int 11 11 11 23 23 11 11 11 11 11 ...
# $ V15: int 100 100 100 100 100 100 100 100 100 100 ...
# $ V16: int 6 6 6 8 8 6 6 6 6 6 ...
The last bit to take into account is that some of the uber-wide fields (V10 and V12) appear to be variable lists of codes. Since I wasn't confident I knew exactly how many, etc, I chose to keep them together, too. (This would be a good time to learn something about some more of Hadley's packages, specifically broom and purrr. This youtube video is an hour of Hadley talking about nested dataframes using these packages.)
(For the record, it does take a while to load the whole file and do this processing, so be patient. My recommendation is to "practice" on a modest subset of the entire file to make sure your processing is doing what you want. When you have your steps mapped out and tested, then run them all and go get a coffee.)
I am able to evaluate today's date using:
String timeframe = DateTime.newInstance(Date.today(), Time.newInstance(0, 0, 0, 0)).format('yyyy-dd-MM');
But I need to add days to this. So I want:
String timeframe = DateTime.newInstance(Date.today()**.addDays(10)**, Time.newInstance(0, 0, 0, 0)).format('yyyy-dd-MM');
query += ' and (created_date__c <= ' + timeframe + ')';
But am receiving this error: line 1:392 no viable alternative at character '2'
I also tried:
DateTime refDate1 = DateTime.newInstance(System.today().year(), System.today().month(), System.today().day(), 0, 0, 0);
DateTime refDate2 = refDate1.addDays(-10);
query += ' and created_date__c <= '+ refDate1.date() + ' and created_date__c >= ' + refDate2.date();
Which throws: line 1:405 no viable alternative at character ' '
Please help!
I resolved this using code I'd already tried, which makes me think there was something wrong with my soql string to begin with.
String timeframe = DateTime.newInstance(System.today().addDays(-10),
Time.newInstance(0, 0, 0,0)).format('yyyy-dd-MM');
whereClause += ' and (';
whereClause += 'created_date__c >= ' + timeframe;
whereClause += ' OR sourced_date__c >= ' + timeframe;
whereClause += ' OR last_phone_bank_call_date__c >=' + timeframe;
whereClause += ')';
I am trying to set the time format in my calendar rows to appear as 1pm - 2pm, 2pm - 3pm, 3pm- 4pm, etc.
I have tried the following:
agenda: 'h:mm{ - h:mm}',
axisFormat: 'h:mm{ - h:mm}',
day: 'h:mm{ - h:mm}',
axisFormat: 'h(:mm)tt',
timeFormat: {
agenda: 'h:mm{ - h:mm}'
},
but none of the above, either alone or in combination seem to work for me.
I initialize my calendar as below:
$('#calendar').fullCalendar({
defaultView: 'agendaDay',
allDaySlot: false,
firstHour: 9,
minTime: 9,
maxTime: 19,
selectable: true,
unselectAuto: true,
slotMinutes: 60,
weekends: false,
year: current_year,
month: current_month,
date: current_day,
columnFormat: '',
FullCalendar does not provide a "to" time for the axisFormat and therefore the { - h:mm} part of you axisFormat is ignored.
And I don't think there's any way to do it without editing the FullCalendar source code.
But if you are feeling adventurous you could do the following changes in fullcalendar.js around line 3207:
d = zeroDate();
maxd = addMinutes(cloneDate(d), maxMinute);
addMinutes(d, minMinute);
// Add two lines
var toD = cloneDate(d);
addMinutes(toD, opt('slotMinutes'));
slotCnt = 0;
for (i=0; d < maxd; i++) {
minutes = d.getMinutes();
s +=
"<tr class='fc-slot" + i + ' ' + (!minutes ? '' : 'fc-minor') + "'>" +
"<th class='fc-agenda-axis " + headerClass + "'>" +
// Changed line from "formatDate(d, opt('axisFormat')...)" to "formatDates(d, toD, opt('axisFormat')...)"
((!slotNormal || !minutes) ? formatDates(d, toD, opt('axisFormat')) : ' ') +
"</th>" +
"<td class='" + contentClass + "'>" +
"<div style='position:relative'> </div>" +
"</td>" +
"</tr>";
addMinutes(d, opt('slotMinutes'));
// Add line
addMinutes(toD, opt('slotMinutes'));
slotCnt++;
}