R studio write to CSV without number-column - r

I want to export my data to CSV, but without the row-number column (or at least with that column numbered properly).
My code (sorry for my bad English):
# Read the source table and reshape it to long format, keeping "Nazwa" as the
# identifier column.
tab<-read.csv2("CENY.csv")
#View(tab)
library(reshape2)
tab.m<-melt(tab,id.vars="Nazwa")
View(tab.m)
dim(tab.m)
# Keep rows 5..436 (432 rows); the label vectors below are built to that length.
# The subset keeps its original row names, which is where the "25", "26", ...
# labels in the written file come from.
tab.m<-tab.m[5:436,]
# Label vectors, each of length 432 (checked with length() below).
mies<-rep(c("sty","lut","mar","kwi","maj","cze","lip","sie","wrz","paź","lis","gru"),each=36)
produkt<-rep(rep(c("cytryny","marchew","cebula"),each=12),12)
rok<-rep(rep(c(2017,2018,2019),each=4),36)
length(mies)
length(produkt)
length(rok)
# The two unnamed arguments become columns "tab.m.Nazwa" and "tab.m.value".
dane.m<-data.frame(tab.m$Nazwa,tab.m$value,mies=mies,produkt=produkt,rok=rok)
#View(dane.m)
# One data frame per product.
X<-split(dane.m, dane.m$produkt)
str(X)
# Start from the "cebula" rows, dropping column 4 (produkt), then add the values
# of the other two products as extra columns.
dane <- X$cebula[,-4]
colnames(dane)<-c("region","cebula","mies","rok")
dane$cytryny<-X$cytryny$tab.m.value
dane$marchew<-X$marchew$tab.m.value
#View(dane)
# write.csv() writes the row names as an unnamed first column by default (the ""
# column in the output below); pass row.names = FALSE to leave it out.
write.csv(dane, "dane-ceny.csv")
and i get:
"","region","cebula","mies","rok","cytryny","marchew"
"25","POLSKA",1.78,"sty",2017,6.56,1.64
"26","MAZOWIECKIE",1.59,"sty",2017,6.9,1.57
"27","OPOLSKIE",1.77,"sty",2017,7.05,1.85
"28","PODKARPACKIE",1.39,"sty",2017,5.83,1.4

Related

How do I import a file into r with extension .DUSMCPUB?

I’m trying to import the Mortality Multiple Cause Files from the National Center for Health Statistics, located at this link:
https://www.cdc.gov/nchs/data_access/vitalstatsonline.htm#Downloadable
link to image of where to find file on NCHS website
The files have an extension .DUSMCPUB (e.g., the file for 2020 is called "VS20MORT.DUSMCPUB_r20220105”). How do I import such a file? I’m not familiar with the extension.
I have tried to import with the following code, but it causes my R program to terminate. Can you please provide me with a suggestion on how to import these types of files?
# NOTE(review): read_delim() parses delimited text, but the answers below describe
# this file as fixed-width, so a fixed-width reader (read.fwf() / read_fwf()) is needed.
VS20MORT <- read_delim("VS20MORT.DUSMCPUB_r20220105")
Thanks @Mel G for sharing this approach. When I tried to run it, I realized that the mortality file includes a few new variables as of 2020 (namely decedent’s occupation and industry). Here’s a slight variation that includes the new variables.
# Install and load necessary packages
# install.packages("sqldf") # NOTE(review): loaded here, but not called by the code shown below
# install.packages("dplyr") # Used for tidy data management
library(sqldf)
library(dplyr)
# Increase the memory limit to make space for the large file.
# memory.limit() only ever had an effect on Windows, and from R 4.2.0 onward it
# is a stub that just warns, so call it only where it still does something.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 20000)
}
# Fixed-width layout of the file: one row per field, holding the field's width
# (in characters) and the column name to give it, in file order.
columns <- data.frame(
  widths = c(
    19, 1, 40, 2, 1, 1, 2, 2, 1, 4, 1, 2, 2, 2, 2, 1, 1, 1, 16, 4, 1, 1, 1,
    1, 34, 1, 1, 4, 3, 1, 3, 3, 2, 1, 2,
    rep(7, 20), # the 20 entity-axis condition fields
    36, 2, 1,
    rep(5, 20), # the 20 record-axis condition fields
    1, 2, 1, 1, 1, 1, 33, 3, 1, 1, 2, 315, 4, 2, 4, 2
  )
)
# Names are attached with `$<-` so they are stored as plain character strings.
# Fields called "blank<N>" are filler that is dropped after reading.
columns$names <- c(
  "blank1", # tape locations 1-19
  "Resident_Status_US", # tape location 20
  "blank2",
  "Education_1989", "Education_2003", "Education_flag",
  "Month_of_Death",
  "blank3",
  "Sex",
  "DetailAge", "Age_Substitution_Flag",
  "Age_Recode_52", "Age_Recode_27", "Age_Recode_12", "Infant_Age_Recode_22",
  "Place_of_Death_and_Status", "Marital_Status", "Day_of_Week_of_Death",
  "blank4",
  "Current_Data_Year",
  "Injury_at_Work", "Manner_of_Death", "Method_of_Disposition", "Autopsy",
  "blank5",
  "Activity_Code", "Place_of_Injury",
  "ICD_Code_10", "Cause_Recode_358",
  "blank6",
  "Cause_Recode_113", "Infant_Cause_Recode_130", "Cause_Recode_39",
  "blank7",
  "Number_Entity_Axis_Conditions",
  paste0("Condition_", 1:20, "EA"),
  "blank8",
  "Number_Record_Axis_Conditions",
  "blank9",
  paste0("Condition_", 1:20, "RA"),
  "blank10",
  "Race", "Bridged_Race_Flag", "Race_Imputation_Flag",
  "Race_Recode_3", "Race_Recode_5",
  "blank11",
  "Hispanic_Origin",
  "blank12",
  "Hispanic_Origin_9_Race_Recode", "Race_Recode_40",
  "blank13",
  "CensusOcc", "Occ_26", "CensusInd", "Ind_23"
)
# Read in file using the widths from the 'columns' data frame.
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
# NOTE(review): read.fwf() converts column types the way read.table() does, so
# all-digit codes may come back as numbers (losing leading zeros); consider
# colClasses = "character" if the codes must be kept verbatim.
mort2020 <- read.fwf(
  "VS20MORT.DUSMCPUB_r20220105",
  widths = columns$widths,
  stringsAsFactors = FALSE
)
# Attach column names to variables
colnames(mort2020) <- columns$names
# Remove the filler ("blank...") variables
mort2020x <- mort2020 %>% dplyr::select(-starts_with("blank"))
Alternatively, it looks like the files are published for most years in a CSV format here: https://www.nber.org/research/data/mortality-data-vital-statistics-nchs-multiple-cause-death-data. 2020 isn’t up yet, but for other years, it can be much faster to read a CSV into R than to use read.fwf.
The data is in the form of a fixed-width file. The user's guide to the data from the National Center for Health Statistics contains the appropriate widths. The answer I present is a modified answer from another forum, posted by @Hack-R.
https://opendata.stackexchange.com/questions/18375/how-can-one-interpret-the-nvss-mortality-multiple-cause-of-death-data-sets
# Fixed-width layout: field widths (in characters), in file order.
map <- data.frame(
  widths = c(
    19, 1, 40, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 16, 4,
    1, 1, 1, 1, 34, 1, 1, 4,
    3, 1, 3, 3, 2, 1, 2,
    rep(7, 20), # cond1..cond20
    36, 2, 1,
    rep(5, 20), # acond1..acond20
    1, 2, 1, 1, 1, 1, 33, 3,
    1, 1
  )
)
# Set column names, one per width. Filler fields are named blank, blank2, ...,
# blank12; each name is used exactly once so the reader does not have to alter
# duplicated column names.
map$cn <- c(
  "blank", # cols 1-19
  "res_status", # 20
  "blank2", # 21-60
  "ed_v89", # 61-62
  "ed_v03", # 63
  "ed_flag", # 64
  "death_month", # 65-66
  "blank3",
  "sex",
  "age_years", "age_months", "age_3", "age_4",
  "age_sub_flag",
  "age_recode_52", "age_recode_27", "age_recode_12", "infant_age_recode_22",
  "place_of_death", "marital_status", "death_day",
  "blank4",
  "current_year",
  "work_injury", "death_manner", "disposition", "autopsy",
  "blank5",
  "activity_code", "place_injured",
  "icd_cause_of_death", "cause_recode358",
  "blank6",
  "cause_recode113", "infant_cause_recode130", "cause_recode39",
  "blank7",
  "num_entity_axis",
  paste0("cond", 1:20),
  "blank8",
  "num_rec_axis_cond",
  "blank9",
  paste0("acond", 1:20),
  "blank10",
  "race", "bridged_race_flag", "race_imp_flag", "race_recode3", "race_recode5",
  "blank11",
  "hisp",
  "blank12",
  "hisp_recode"
)
# Import the file. read_fwf() and fwf_widths() come from the readr package,
# which has to be attached before they can be called.
library(readr)
mort2020 <- read_fwf(
  "./data/original/VS20MORT.DUSMCPUB_r20220105",
  fwf_widths(map$widths, map$cn)
)

Extract attributes in XML using R

I am trying to extract two attributes from an XML file (the extract below comes from a large XML file), namely 'nmRegime' and 'CalendarSystemT' (this is the date). Once extracted, these two values need to be saved as two columns in a data frame in R, along with the filename.
There are several 'event' nodes within one given XML file and there are nearly 100 individual XML files.
<Event tEV="FirA" clearEV="false" onEV="true" dateOriginEV="Calendar" nYrsFromStEV="" nDaysFromStEV="" tFaqEV="Blank" tAaqEV="Blank" aqStYrEV="0" aqEnYrEV="0" nmEV="Fire_Cool" categoryEV="CatUndef" tEvent="Doc" idSP="105" nmRegime="Wheat, Tilled, stubble cool burn" regimeInstance="1">
<notesEV></notesEV>
<dateEV CalendarSystemT="FixedLength">19710331</dateEV>
<FirA fracAfctFirA="0.6" fracGbfrToAtmsFirA="0.98" fracStlkToAtmsFirA="0.98" fracLeafToAtmsFirA="0.98" fracGbfrToGlitFirA="0.02" fracStlkToSlitFirA="0.02" fracLeafToLlitFirA="0.02" fracCortToCodrFirA="1.0" fracFirtToFidrFirA="1.0" fracDGlitToAtmsFirA="0.931" fracRGlitToAtmsFirA="0.931" fracDSlitToAtmsFirA="0.931" fracRSlitToAtmsFirA="0.931" fracDLlitToAtmsFirA="0.931" fracRLlitToAtmsFirA="0.931" fracDCodrToAtmsFirA="0.0" fracRCodrToAtmsFirA="0.0" fracDFidrToAtmsFirA="0.0" fracRFidrToAtmsFirA="0.0" fracDGlitToInrtFirA="0.019" fracRGlitToInrtFirA="0.019" fracDSlitToInrtFirA="0.019" fracRSlitToInrtFirA="0.019" fracDLlitToInrtFirA="0.019" fracRLlitToInrtFirA="0.019" fracDCodrToInrtFirA="0.0" fracRCodrToInrtFirA="0.0" fracDFidrToInrtFirA="0.0" fracRFidrToInrtFirA="0.0" fracSopmToAtmsFirA="" fracLrpmToAtmsFirA="" fracMrpmToAtmsFirA="" fracSommToAtmsFirA="" fracLrmmToAtmsFirA="" fracMrmmToAtmsFirA="" fracMicrToAtmsFirA="" fracSopmToInrtFirA="" fracLrpmToInrtFirA="" fracMrpmToInrtFirA="" fracSommToInrtFirA="" fracLrmmToInrtFirA="" fracMrmmToInrtFirA="" fracMicrToInrtFirA="" fracMnamNToAtmsFirA="" fracSAmmNToAtmsFirA="" fracSNtrNToAtmsFirA="" fracDAmmNToAtmsFirA="" fracDNtrNToAtmsFirA="" fixFirA="" phaFirA="" />
</Event>
Had some success in extracting 'nmRegime' but no success with 'CalendarSystemT'. Used below code for data extraction.
The second question, is there a way to loop the list of XML files and do this operation?
# get records
library(xml2)
# NOTE(review): `xml` is never created in this snippet; presumably it holds a
# document already parsed with read_xml() - confirm.
recs <- xml_find_all(xml, "//Event")
#extract the names
# One entry per <Event>; events without an nmRegime attribute are dropped on the next line.
labs <- trimws(xml_attr(recs, "nmRegime"))
names <- labs[!is.na(labs)]
# Extract the date
# xml_attr() returns the value of the CalendarSystemT attribute. In the sample
# above the date (19710331) is the text content of <dateEV>, not an attribute,
# so this does not yield the date.
recs_t <- xml_find_all(xml, "//Event/dateEV")
time <- trimws(xml_attr(recs_t, "CalendarSystemT"))
The calendar time value is not an attribute but is stored as the node's element and is accessed directly.
Also note that if an Event Node is missing a "dateEV" then there will be problems aligning the "labs" with the "time". It is better to extract the time value from each parent node instead of the entire document.
library(xml2)
library(dplyr)
xml<- read_xml('<Event tEV="FirA" clearEV="false" onEV="true" dateOriginEV="Calendar" nYrsFromStEV="" nDaysFromStEV="" tFaqEV="Blank" tAaqEV="Blank" aqStYrEV="0" aqEnYrEV="0" nmEV="Fire_Cool" categoryEV="CatUndef" tEvent="Doc" idSP="105" nmRegime="Wheat, Tilled, stubble cool burn" regimeInstance="1">
<notesEV></notesEV>
<dateEV CalendarSystemT="FixedLength">19710331</dateEV>
<FirA fracAfctFirA="0.6" fracGbfrToAtmsFirA="0.98" fracStlkToAtmsFirA="0.98" fracLeafToAtmsFirA="0.98" fracGbfrToGlitFirA="0.02" fracStlkToSlitFirA="0.02" fracLeafToLlitFirA="0.02" fracCortToCodrFirA="1.0" fracFirtToFidrFirA="1.0" fracDGlitToAtmsFirA="0.931" fracRGlitToAtmsFirA="0.931" fracDSlitToAtmsFirA="0.931" fracRSlitToAtmsFirA="0.931" fracDLlitToAtmsFirA="0.931" fracRLlitToAtmsFirA="0.931" fracDCodrToAtmsFirA="0.0" fracRCodrToAtmsFirA="0.0" fracDFidrToAtmsFirA="0.0" fracRFidrToAtmsFirA="0.0" fracDGlitToInrtFirA="0.019" fracRGlitToInrtFirA="0.019" fracDSlitToInrtFirA="0.019" fracRSlitToInrtFirA="0.019" fracDLlitToInrtFirA="0.019" fracRLlitToInrtFirA="0.019" fracDCodrToInrtFirA="0.0" fracRCodrToInrtFirA="0.0" fracDFidrToInrtFirA="0.0" fracRFidrToInrtFirA="0.0" fracSopmToAtmsFirA="" fracLrpmToAtmsFirA="" fracMrpmToAtmsFirA="" fracSommToAtmsFirA="" fracLrmmToAtmsFirA="" fracMrmmToAtmsFirA="" fracMicrToAtmsFirA="" fracSopmToInrtFirA="" fracLrpmToInrtFirA="" fracMrpmToInrtFirA="" fracSommToInrtFirA="" fracLrmmToInrtFirA="" fracMrmmToInrtFirA="" fracMicrToInrtFirA="" fracMnamNToAtmsFirA="" fracSAmmNToAtmsFirA="" fracSNtrNToAtmsFirA="" fracDAmmNToAtmsFirA="" fracDNtrNToAtmsFirA="" fixFirA="" phaFirA="" />
</Event>')
# Select every <Event> node in the document.
recs <- xml_find_all(xml, "//Event")
# Extract the regime name of each event (NA where the attribute is absent),
# then keep only the non-missing names.
labs <- trimws(xml_attr(recs, "nmRegime"))
names <- labs[!is.na(labs)]
# Extract the date: it is the text of the first <dateEV> found under each event.
# Searching relative to each event (".//") gives one entry per event, so `time`
# lines up with `labs` element by element.
time <- xml_find_first(recs, ".//dateEV") %>% xml_text() %>% trimws()
To answer your second question: yes, you can wrap the above script into a function and then use lapply to loop through your entire list of files.
See this question and answer for details: R XML - combining parent and child nodes(w same name) into data frame

merge different files into 1 text file in R

I have two files with one being text, and the other being a data frame, now I just want to merge them into one as a text file. With linux, I can use:
cat file1 file2 > outputfile
I wonder if we can do the same thing with R?
file1
##TITLE=POOLED SAMPLES COLLECTED 05-06/03/2018
##JCAMP-DX=4.24
##DATA TYPE=LINK
#ORIGIN Bentley_FTS SomaCount_FCM 82048
##OWNER=Bentley Instruments Inc
##DATE=2018-03-09
##TIME=15:34:48
##BLOCKS=110
##LAB1=Auto Generated
##LAB2=
##CHANNELNAMES=8
file 2:
649.025085449219 0.063037105 0.021338785 -0.00053594 0.008937807 0.03266982
667.675231457819 0.028557044 0.005877694 -0.015043681 0.014945094 0.051547796
686.325377466418 0.021499421 0.017125281 0.043007832 0.04132269 0.027496092
704.975523475018 0.006128653 -0.014599532 -0.000335723 0.020189898 0.024547976
723.625669483618 0.018550962 0.018567896 0.014100821 0.013067127 0.027075281
742.275815492218 0.030145327 0.039745297 0.050556265 0.056621946 0.058416516
760.925961500818 0.040279277 0.01392867 -0.00143011 0.015103153 0.03580305
779.576107509418 0.031955898 0.013671243 0.000861743 0.000641993 0.001747168
Thanks alot
Phuong
We can use file.append:
# Start from an empty output file so that re-running this does not append to the
# result of an earlier run (the equivalent of `>` in `cat file1 file2 > outputfile`).
file.create("fileMerged.txt")
# file.append(file1, file2) appends the contents of file2 to file1.
file.append("fileMerged.txt", "file1.txt")
file.append("fileMerged.txt", "file2.txt")
Or if files are already imported into R, then write with append:
# Read both inputs into character vectors, one element per line.
f1 <- readLines("file1.txt")
f2 <- readLines("file2.txt")
# Write the lines of the first file followed by the lines of the second,
# replacing any existing output file.
writeLines(c(f1, f2), "fileMerged.txt")

incomplete list of csv file imported in R

I need to import a list of 36 csv files, but after running the code I get only 26 of them. Probably, 10 files have format problems. Is there a way in R to detect the 10 files that cannot be imported?
If you have the file names in a character vector, you can use the following code:
# Every file name that should have been imported.
# Note: this variable shares its name with the base function all().
all <- c("16048.txt", "16062.txt", "16066.txt", "16093.txt", "16095.txt", "16122.txt", "16241.txt", "16360.txt", "16380.txt", "16389.txt", "16510.txt", "16511.txt", "16701.txt", "16729.txt", "16735.txt", "16737.txt", "16761.txt", "16816.txt", "16867.txt", "16876.txt", "16880.txt", "16883.txt", "16884.txt", "16885.txt", "16893.txt", "16904.txt", "16906.txt", "16908.txt", "16929.txt", "16931.txt", "16938.txt", "16943.txt", "16959.txt", "16967.txt", "16968.txt", "16969.txt")
# The file names that were actually imported (in whatever order they came back).
imp <- c("16761.txt", "16959.txt", "16884.txt", "16093.txt", "16883.txt", "16122.txt", "16906.txt", "16737.txt", "16968.txt", "16095.txt", "16062.txt", "16816.txt", "16360.txt", "16893.txt", "16885.txt", "16938.txt", "16048.txt", "16931.txt", "16876.txt", "16511.txt", "16969.txt", "16241.txt", "16967.txt", "16701.txt", "16380.txt", "16510.txt")
Where all is the list of filenames you need and imp is the imperfect result you got. You can get a list of the missing files with:
# Keep the entries of `all` that have no match in `imp`, i.e. the files that
# were expected but did not get imported.
missing <- all[is.na(match(all, imp))]

warning in importing a CSV to R; "-" not meaningful for factors

I'm trying to import a set of dates from CSV to R.
Code:
# NOTE(review): `<--` is parsed as `<-` followed by a unary minus, i.e.
# dates <- -read.csv(...); negating the imported column is what produces the
# warning and the NAs shown below.
dates <-- read.csv(file="dates.csv",header=FALSE)
Result:
Warning message:
In Ops.factor(left) : ‘-’ not meaningful for factors
The result is a bunch of NAs.
CSV file when opened in Notepad:
31/07/2014
30/07/2014
29/07/2014
28/07/2014
25/07/2014
24/07/2014
23/07/2014
22/07/2014
21/07/2014
18/07/2014
17/07/2014
16/07/2014
15/07/2014
14/07/2014
11/07/2014
10/07/2014
9/07/2014
8/07/2014
7/07/2014
4/07/2014
3/07/2014
2/07/2014
1/07/2014
30/06/2014
27/06/2014
26/06/2014
25/06/2014
24/06/2014
23/06/2014
20/06/2014
19/06/2014
18/06/2014
17/06/2014
16/06/2014
13/06/2014
12/06/2014
11/06/2014
10/06/2014
9/06/2014
6/06/2014
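The problem is the `<--`: R parses it as the assignment operator `<-` followed by a unary minus, so it tries to negate the imported data, and `-` is not meaningful for factors (hence the warning and the NAs). Use the plain `<-` operator, which is the standard assignment operator in R: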
# Plain `<-` assignment: the data frame is stored exactly as read.
dates <- read.csv("dates.csv", header = FALSE)
instead of the one you are typing above.
You could also read this for the different assignment operators used in R.

Resources