Conditional means based on other columns in R with dplyr - r

Let's say I have the following data:
structure(list(political_spectrum = c(5L, 15L, 12L, 30L, 100L,
0L, 27L, 52L, 38L, 64L, 0L, 0L, 76L, 50L, 16L, 16L, 0L, 23L,
0L, 25L, 68L, 50L, 4L, 0L, 50L), politics_today = c("Independent",
"Strong Democrat", "Weak Democrat", "Weak Democrat", "Weak Republican",
"Strong Democrat", "Weak Democrat", "Weak Democrat", "Independent",
"Weak Democrat", "Strong Democrat", "Independent", "Weak Republican",
"Weak Democrat", "Weak Democrat", "Strong Democrat", "Strong Democrat",
"Strong Democrat", "Strong Democrat", "Strong Democrat", "Independent",
"Independent", "Strong Democrat", "Strong Democrat", "Independent"
), stranger_things_universe_mc = c("The Demagorgon", "", "",
"", "", "", "", "", "", "The Stranger Land", "The Demagorgon",
"The Upside Down", "", "", "", "", "", "The Upside Down", "The Shadowland",
"", "", "", "", "", "The Shadowland"), stranger_things_universe_answer = c("The Upside Down",
"", "", "", "", "", "", "", "", "The Upside Down", "The Upside Down",
"The Upside Down", "", "", "", "", "", "The Upside Down", "The Upside Down",
"", "", "", "", "", "The Upside Down"), stranger_things_universe_confidence = c(32L,
NA, NA, NA, NA, NA, NA, NA, NA, 67L, 94L, 89L, NA, NA, NA, NA,
NA, 51L, 10L, NA, NA, NA, NA, NA, 0L), stranger_things_universe_importance = c("Don't care at all",
"", "", "", "", "", "", "", "", "Care somewhat strongly", "Care a little",
"Care somewhat strongly", "", "", "", "", "", "Care somewhat",
"Don't care at all", "", "", "", "", "", "Don't care at all"),
tupac_mc = c("", "Biggie Smalls", "", "", "", "", "", "Biggie Smalls",
"Biggie Smalls", "", "", "Biggie Smalls", "", "", "", "",
"", "", "Biggie Smalls", "", "", "Ice Cube", "", "", ""),
tupac_answer = c("", "Biggie Smalls", "", "", "", "", "",
"Biggie Smalls", "Biggie Smalls", "", "", "Biggie Smalls",
"", "", "", "", "", "", "Biggie Smalls", "", "", "Biggie Smalls",
"", "", ""), tupac_confidence = c(NA, 70L, NA, NA, NA, NA,
NA, 71L, 76L, NA, NA, 100L, NA, NA, NA, NA, NA, NA, 100L,
NA, NA, 32L, NA, NA, NA), tupac_importance = c("", "Don't care at all",
"", "", "", "", "", "Care somewhat", "Don't care at all",
"", "", "Care strongly", "", "", "", "", "", "", "Care a little",
"", "", "Don't care at all", "", "", ""), uber_ceo_mc = c("John Zimmer",
"", "", "", "", "Travis Kalanick", "", "", "", "Travis Kalanick",
"", "", "", "", "", "", "", "John Zimmer", "Travis Kalanick",
"Travis Kalanick", "", "", "", "", ""), uber_ceo_answer = c("Travis Kalanick",
"", "", "", "", "Travis Kalanick", "", "", "", "Travis Kalanick",
"", "", "", "", "", "", "", "Travis Kalanick", "Travis Kalanick",
"Travis Kalanick", "", "", "", "", ""), uber_ceo_confidence = c(0L,
NA, NA, NA, NA, 94L, NA, NA, NA, 69L, NA, NA, NA, NA, NA,
NA, NA, 5L, 13L, 17L, NA, NA, NA, NA, NA), uber_ceo_importance = c("Don't care at all",
"", "", "", "", "Care strongly", "", "", "", "Care somewhat",
"", "", "", "", "", "", "", "Don't care at all", "Don't care at all",
"Care somewhat", "", "", "", "", ""), black_panther_mc = c("",
"T'Chaka", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "T'Chaka", "", ""), black_panther_answer = c("",
"T'Challa", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "T'Challa", "", ""), black_panther_confidence = c(NA,
63L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 34L, NA, NA), black_panther_importance = c("",
"Don't care at all", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "Care a little",
"", ""), the_office_mc = c("The Mindy Project", "", "", "",
"", "", "", "", "", "", "", "", "", "", "The Office", "",
"", "The Mindy Project", "", "", "", "", "The Office", "",
""), the_office_answer = c("The Office", "", "", "", "",
"", "", "", "", "", "", "", "", "", "The Office", "", "",
"The Office", "", "", "", "", "The Office", "", ""), the_office_confidence = c(43L,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L, NA,
NA, 11L, NA, NA, NA, NA, 100L, NA, NA), the_office_importance = c("Don't care at all",
"", "", "", "", "", "", "", "", "", "", "", "", "", "Don't care at all",
"", "", "Care a little", "", "", "", "", "Care a little",
"", ""), arms_manufacturing_company_mc = c("J. Brockton & Sons",
"", "", "O.F. Mossberg & Sons", "", "", "", "", "", "", "",
"", "J. Brockton & Sons", "", "", "", "", "", "", "", "",
"", "", "", "J. Brockton & Sons"), arms_manufacturing_company_answer = c("J. Brockton & Sons",
"", "", "J. Brockton & Sons", "", "", "", "", "", "", "",
"", "J. Brockton & Sons", "", "", "", "", "", "", "", "",
"", "", "", "J. Brockton & Sons"), arms_manufacturing_company_confidence = c(91L,
NA, NA, 24L, NA, NA, NA, NA, NA, NA, NA, NA, 37L, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 100L), arms_manufacturing_company_importance = c("Don't care at all",
"", "", "Don't care at all", "", "", "", "", "", "", "",
"", "Don't care at all", "", "", "", "", "", "", "", "",
"", "", "", "Don't care at all")), class = c("data.table",
"data.frame"), row.names = c(NA, -25L))
I'm trying to do something like the following:
test %>%
# NOTE(review): gather() stacks columns of mixed types, so `value` is coerced
# to character here — mean() on a character vector returns NA with a warning,
# which is the root of the asker's problem.
gather(name, value, -c('political_spectrum', 'politics_today')) %>%
filter(value != "") %>%
mutate(question_id = sub("_[^_]+$", "", name)) %>% # strip the trailing "_mc"/"_answer"/"_confidence"/"_importance" suffix
mutate(confidence = grepl("_confidence", name)) %>% # TRUE for the *_confidence rows
group_by(politics_today, question_id) %>%
# NOTE(review): `confidence` is logical; comparing it to the string "TRUE"
# only works through implicit coercion, and `value` is still character, so
# this mean() cannot produce numbers as written.
summarize(mean_confidence = mean(value[confidence == "TRUE"]))
in which I get the mean_confidence values for each political affiliation, but only for specific rows in the "value" column. In order to run the mean only on "confidence" columns, I am trying to do a filter via mean(value[confidence == "TRUE"]), but am not sure the correct way to do this.

I think you need to change your code to
library(tidyverse)

# Reshape wide -> long, keep only non-empty answers, then average the
# *_confidence entries per political affiliation and question.
test %>%
  # pivot_longer() supersedes gather(); columns of mixed integer/character
  # type must be coerced to a common type explicitly.
  pivot_longer(-c(political_spectrum, politics_today),
               names_to = "name", values_to = "value",
               values_transform = list(value = as.character)) %>%
  filter(value != "") %>%                              # drop empty answers (NA confidences fall out here too)
  mutate(question_id = sub("_[^_]+$", "", name),       # e.g. "tupac_confidence" -> "tupac"
         confidence = grepl("_confidence", name)) %>%  # flag the confidence rows
  group_by(politics_today, question_id) %>%
  # `confidence` is logical, so it subsets `value` directly; the surviving
  # values are numbers stored as strings, hence as.numeric().
  summarize(mean_confidence = mean(as.numeric(value[confidence])),
            .groups = "drop")
# politics_today question_id mean_confidence
# <chr> <chr> <dbl>
# 1 Independent arms_manufacturing_company 95.5
# 2 Independent stranger_things_universe 40.3
# 3 Independent the_office 43
# 4 Independent tupac 69.3
# 5 Independent uber_ceo 0
# 6 Strong Democrat black_panther 48.5
# 7 Strong Democrat stranger_things_universe 51.7
# 8 Strong Democrat the_office 55.5
# 9 Strong Democrat tupac 85
#10 Strong Democrat uber_ceo 32.2
#11 Weak Democrat arms_manufacturing_company 24
#12 Weak Democrat stranger_things_universe 67
#13 Weak Democrat the_office 2
#14 Weak Democrat tupac 71
#15 Weak Democrat uber_ceo 69
#16 Weak Republican arms_manufacturing_company 37
Since your value column has got both numeric and character values, it gets converted to a character column so you need to change the value where confidence == TRUE to numeric.

Related

Need help merging string data from column that runs into below rows. Problem in multiple columns, leaving empty data in cells for other columns

In a nutshell - I have multiple columns in my data frame and some of the columns have string data that spill into the rows below, which means that those near-empty rows only have info for those spillover columns. I would like to merge the rows, and combine all the string data into that specific cell for that column with the spillover issue (I need to do this all in R please...). I also have this problem in different columns, and it does not happen in every row.... This is hard to explain with words, but my output below explains the problem the best. I figured that a dput output would be better than pasting a table here so that people could actually use this with code. This is a very simplified version of my data frame and the problem.
structure(list(SECTION = c(10207L, NA, 14097L, NA, NA, NA, NA,
21290L, NA, 3359L, NA, NA, NA, NA, 50903L, NA), SCHOOL = c("ACAD",
"", "ACCT", "", "", "", "", "ANSC", "", "LAW", "", "", "", "",
"XPPD", "PPD"), COURSE_CODE = c("ACAD-181", "", "ACCT-410", "",
"", "", "", "PR-463", "", "LAW-680A", "", "", "", "", "PPDE-630",
""), COURSE_TITLE = c("Disruptive Innovation", "", "Foundations of Accounting",
"", "", "", "", "Strategic Public Relations Research, Analysis",
"and Insights", "Review of Law and Social Justice Editing", "",
"", "", "", "Community Health Planning", ""), INSTRUCTOR_NAME = c("Smith, Tim",
"Bob, Scott", "Gem, Silvia", "", "", "", "", "OBrien, James",
"", "Harvey, Tony", "", "", "", "", "Sloth, Ryan", ""), ASSIGNED_ROOM = c("IYH210/211",
"", "ONLINE", "", "", "", "", "ONLINE", "", "ONLINE", "", "",
"", "", "ONLINE", ""), TOTAL_ENR = c(32L, NA, 55L, NA, NA, NA,
NA, 17L, NA, 13L, NA, NA, NA, NA, 16L, NA), COURSE_DESCRIPTION = c("Critical approaches to social and cultural changes.",
"", "Non-technical presentation of accounting for users of accounting",
"information; introduction to financial and managerial accounting.",
"Not open to students with course credits in accounting. Not",
"available for unit or course credit toward a degree in accounting",
"or business administration.", "Identification of key strategic insights.",
"", "Supervision of research and writing, and final editing of articles",
"and comments for publication in the Review of Law and Social",
"Justice. For officers of the Review. Open to law students only.",
"Graded IP to CR/D/F.", "", "The role of planning in sustaining community health.",
"")), class = "data.frame", row.names = c(NA, -16L))
I think this will work.
library(tidyverse)
X <- structure(list(SECTION = c(10207L, NA, 14097L, NA, NA, NA, NA, 21290L, NA, 3359L, NA, NA, NA, NA, 50903L, NA),
SCHOOL = c("ACAD", "", "ACCT", "", "", "", "", "ANSC", "", "LAW", "", "", "", "", "XPPD", "PPD"),
COURSE_CODE = c("ACAD-181", "", "ACCT-410", "", "", "", "", "PR-463", "", "LAW-680A", "", "", "", "", "PPDE-630", ""),
COURSE_TITLE = c("Disruptive Innovation", "", "Foundations of Accounting", "", "", "", "", "Strategic Public Relations Research, Analysis", "and Insights", "Review of Law and Social Justice Editing", "", "", "", "", "Community Health Planning", ""),
INSTRUCTOR_NAME = c("Smith, Tim", "Bob, Scott", "Gem, Silvia", "", "", "", "", "OBrien, James", "", "Harvey, Tony", "", "", "", "", "Sloth, Ryan", ""),
ASSIGNED_ROOM = c("IYH210/211", "", "ONLINE", "", "", "", "", "ONLINE", "", "ONLINE", "", "", "", "", "ONLINE", ""),
TOTAL_ENR = c(32L, NA, 55L, NA, NA, NA, NA, 17L, NA, 13L, NA, NA, NA, NA, 16L, NA),
COURSE_DESCRIPTION = c("Critical approaches to social and cultural changes.", "", "Non-technical presentation of accounting for users of accounting", "information; introduction to financial and managerial accounting.", "Not open to students with course credits in accounting. Not", "available for unit or course credit toward a degree in accounting", "or business administration.", "Identification of key strategic insights.", "", "Supervision of research and writing, and final editing of articles", "and comments for publication in the Review of Law and Social", "Justice. For officers of the Review. Open to law students only.", "Graded IP to CR/D/F.", "", "The role of planning in sustaining community health.", "")),
class = "data.frame", row.names = c(NA, -16L))
# Collapse "spillover" rows (rows whose SECTION is NA) into the row above them.
X_collapsed <- X
# Walk bottom-to-top so multi-row spillovers cascade upward correctly.
for(i in seq(nrow(X), 2, -1)) {
  if(is.na(X_collapsed[i, "SECTION"])) { # only merge rows with NA in SECTION
    # Merge row i into row i-1:
    #  - numeric columns are left untouched (don't want to merge NA values)
    #  - character columns get row i's value appended via lead() + paste(),
    #    with NA treated as an empty string
    #  - trimws() strips the blank that paste() inserts when one side is empty
    #    (the original called trim(), which exists in neither base R nor the
    #    tidyverse and would error at runtime)
    X_collapsed[i-1,] <- (X_collapsed[c(i-1,i),] %>%
      mutate(across(everything(),
                    ~ if (is.numeric(.x)) .x
                      else trimws(paste(.x, ifelse(is.na(lead(.x)), "", lead(.x))))))
    )[1, ]
  }
}
X_collapsed <- X_collapsed %>%
  filter(!is.na(SECTION)) # drop the merged-away spillover rows
X_collapsed
This also trims excess whitespace — use base R's trimws() for that (there is no trim() function in base R or the tidyverse). Without trimming you may end up with trailing spaces.

Conditional str_remove based on data frame column

I have a dataframe (pasted below) in which I am trying to set the value of one column to blank based on the value of another column. The idea is that if X6 equals Nbre CV, or if X6 equals Nbre BVD, then I want X6 for that row to be blank.
Unfortunately using the following code the entire X6 column turns to NA or missing.
# NOTE(review): the pipes below are nested *inside* the first mutate(), so the
# inner mutate() calls receive the across() output rather than the data frame.
# Also `X6 == `Nbre CV`` evaluates to NA wherever `Nbre CV` is NA, and
# if_else() propagates that NA — which is why the whole X6 column turns to NA.
extractstack <- extractstack %>%
mutate(across(everything(), as.character) %>%
mutate(X6 = if_else(X6 == `Nbre CV`, str_remove(X6, `Nbre CV`), X6)) %>%
# NOTE(review): this condition repeats `Nbre CV`; presumably it was meant to
# compare against `Nbre BVD` — confirm intent.
mutate(X6 = if_else(X6 == `Nbre CV`, str_remove(X6, `Nbre BVD`), X6)))
structure(list(X1 = c("", "", "40", "", "", "41", "", "", "42",
"", "", "43", "", "", "44", ""), X2 = c("", "", "EP. KAPALA",
"", "", "INST. MOTULE", "", "", "CABANE BABOA", "", "", "CABANE BANANGI",
"", "", "E.P.BINZI", ""), X3 = c("", "", "MOBATI-BOYELE", "",
"", "MOBATI-BOYELE", "", "", "MOBATI-BOYELE", "", "", "AVURU-GATANGA",
"", "", "AVURU-GATANGA", ""), X4 = c("", "", "BOGBASA", "", "",
"BOSOBEA", "", "", "BOSOBEA", "", "", "BANANGI", "", "", "GURUZA",
""), X5 = c("", "", "", "", "", "MOBENGE", "", "", "BABOA", "",
"", "DIFONGO", "", "", "DULIA", ""), X6 = c("", "", "BOGBASA",
"", "", "", "1", "", "", "1", "", "", "1", "", "", "1"), X7 = c("1",
"", "", "1", "", "", "4", "", "", "1", "", "", "1", "", "", "5"
), X8 = c("2", "", "", "2", "", "", "510 110", "", "", "510 111",
"", "", "510 112", "", "", "510 113"), X9 = c("510 108", "",
"", "510 109", "", "", "A - D", "", "", "A", "", "", "A", "",
"", "A - E"), page = c("4", "4", "4", "4", "5", "5", "5", "5",
"5", "5", "5", "5", "5", "5", "5", "5"), Plage = c("A - B", NA,
NA, "A - B", NA, NA, "A - D", NA, NA, "A", NA, NA, "A", NA, NA,
"A - E"), `Code SV` = c("510 108", NA, NA, "510 109", NA, NA,
"510 110", NA, NA, "510 111", NA, NA, "510 112", NA, NA, "510 113"
), `Nbre BVD` = c("2", NA, NA, "2", NA, NA, "4", NA, NA, "1",
NA, NA, "1", NA, NA, "5"), `Nbre CV` = c("1", NA, NA, "1", NA,
NA, "1", NA, NA, "1", NA, NA, "1", NA, NA, "1")), class = "data.frame", row.names = c(NA,
-16L))
That's basically Chris Ruehlemann's answer (I don't know why he deleted it; I would delete this one in favor of his original if he restores it):
library(dplyr)

# Coerce every column to character, then blank out X6 wherever it matches
# `Nbre BVD` or `Nbre CV`. coalesce() restores the original X6 wherever the
# comparison is NA (i.e. where the lookup columns hold NA).
extractstack %>%
  mutate(across(everything(), as.character)) %>%
  mutate(matches_count_col = X6 == `Nbre BVD` | X6 == `Nbre CV`) %>%
  mutate(X6 = coalesce(ifelse(matches_count_col, "", X6), X6)) %>%
  select(-matches_count_col)
compares X6 with the columns Nbre BVD and Nbre CV. If there is matching content, X6 will be changed to an empty string "", else X6 stays unchanged. But for your given data, this code doesn't replace anything, since there are simply no matches in X6 with Nbre BVD and Nbre CV besides NA-values.

How do I convert a dataset with 1 column per day to only 1 date column (days) with R

I have a dataset with birds observations. I have one column for each day of the month (31). If the bird is seen this day, it has the "place" info (where it has been seen). Can someone help we with a code that can merge these 31 columns (days) into one date column and another second column with the "place" information? I think I can use the "dcast" function from "reshape2" package but I don't know how to use it to keep the two informations (day and place).
Here is the structure of my dataset:
bird_data = structure(
list(
ID = c(
"FB37461",
"FA42342",
"FA42261",
"FB37329",
"FA42332",
"FH60963",
"FB37473",
"FB37593",
"FA85545",
"FC10619"
),
Name = c(
"Dekort",
"Simon",
"Devil",
"Chimere",
"Private Norman",
"Aurresku",
"Rombus",
"Tan?",
"Taiwan",
"Bakugo"
),
Lring = c("",
"ID/RD", "", "DB", "", "YL/WT", "", "", "", "ID/DG"),
Rring = c("",
"DB", "", "MV/ID", "", "ID", "", "", "", "O"),
sex = c("M", "F",
"F", "U", "F", "F", "U", "J", "F", "J"),
month = c(
"October",
"December",
"July",
"April",
"November",
"November",
"March",
"April",
"August",
"March"
),
year = c(
2016L,
2018L,
2015L,
2018L,
2016L,
2018L,
2015L,
2015L,
2016L,
2018L
),
seen = c(1L, 0L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 1L),
Freq = c(13L, 0L, 9L, 10L, 4L, 0L,
0L, 7L, 5L, 14L),
`1` = c("S", "", "A", "", "", "", "", "", "",
"AS"),
`2` = c("", "", "", "", "", "", "", "", "", "A"),
`3` = c("",
"", "", "A", "", "", "", "", "", ""),
`4` = c("S", "", "", "A",
"S", "", "", "", "", ""),
`5` = c("S", "", "", "A", "", "", "",
"", "", ""),
`6` = c("", "", "", "", "", "", "", "", "", "A"),
`7` = c("S", "", "", "", "", "", "", "A", "", "A"),
`8` = c("",
"", "A", "", "", "", "", "A", "", "S"),
`9` = c("", "", "",
"", "", "", "", "", "", "A"),
`10` = c("", "", "A", "A",
"", "", "", "A", "", ""),
`11` = c("", "", "", "", "", "",
"", "A", "", ""),
`12` = c("A", "", "", "A", "", "", "",
"", "", ""),
`13` = c("S", "", "", "AS", "", "", "", "",
"", "A"),
`14` = c("", "", "AF", "", "", "", "", "A", "",
"S"),
`15` = c("", "", "A", "", "", "", "", "", "", ""),
`16` = c("", "", "A", "", "S", "", "", "A", "", "S"),
`17` = c("",
"", "A", "A", "", "", "", "A", "F", ""),
`18` = c("AS", "",
"A", "", "S", "", "", "", "", ""),
`19` = c("", "", "", "",
"", "", "", "", "", ""),
`20` = c("S", "", "", "A", "", "",
"", "", "", "S"),
`21` = c("S", "", "", "", "", "", "", "",
"", ""),
`22` = c("", "", "", "", "", "", "", "", "", "S"),
`23` = c("", "", "", "", "", "", "", "", "A", ""),
`24` = c("",
"", "", "", "S", "", "", "", "", ""),
`25` = c("S", "", "",
"", "", "", "", "", "S", ""),
`26` = c("S", "", "", "A",
"", "", "", "", "F", ""),
`27` = c("F", "", "", "A", "",
"", "", "", "", "S"),
`28` = c("S", "", "", "", "", "", "",
"", "", ""),
`29` = c("", "", "A", "", "", "", "", "", "",
"S"),
`30` = c("", "", "", "", "", "", "", "", "AF", "S"),
`31` = c("", "", "", "", "", "", "", "", "", "")
),
row.names = c(
11419L,
21637L,
7186L,
17878L,
11678L,
21385L,
6290L,
6640L,
10785L,
17740L
),
class = "data.frame"
)
For example at row 9, we have "Taiwan" with 5 observations in August 2016. She has been seen on the 17th, 23rd, 25th, 26th, 30th. So I want to do 5 lines (5 observations) with one column for date and another one for place seen (F, A, S).
Easiest way is simply to select the columns you want to pivot and put everything else into some other format.
# Pivot the 31 day-of-month columns into (day, location) pairs, then keep
# only the days on which the bird named "Taiwan" was actually seen.
bird_data %>%
  pivot_longer(cols = all_of(paste(1:31)),  # paste() turns 1:31 into the column names "1".."31"
               names_to = "day",
               values_to = "location") %>%
  filter(Name == "Taiwan",  # show only Taiwan's rows
         location != "")    # drop days with no sighting
I use the paste() fx to force the integers into characters. The last filter step is just to 1. get rid of the null/empty rows and 2. display the rows for Taiwan.
Once you've pivoted it longer, you can figure out what you want to do with the excess rows for Freq and other cols. It's easy to drop other columns in the cols argument for pivot_longer().

right_join and mutate does not preserve the index in R

I am Mapping column_data to master and if column value is present in master than it saves it Key
ex:Parent for P and Child for C
The problem is that I do get the expected output values, but the rows are ordered differently.
DATA
column_data <- c("", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "P", "C", "C")
master <- list("Parent" = c("P"),
"Child" = c("C")
)
CODE
library(dplyr)
# One-column data frame holding the raw codes ("", "P", "C", ...).
df <- data.frame("column" = column_data)
# NOTE(review): right_join(x, y) returns the matched x rows first, followed by
# the unmatched y rows — so the "P"/"C" matches surface at the top instead of
# keeping df's original row order. That is the reordering described below.
df <-stack(master) %>%
type.convert(as.is = TRUE) %>%
right_join(df, by = c('values' = 'column')) %>%
mutate(output = coalesce(ind, values))
This Should be the output:
structure(list(values = c("", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "P", "C", "C"), ind = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Parent",
"Child", "Child"), output = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "Parent", "Child", "Child")), class = "data.frame", row.names = c(NA,
-19L))
but instead i get this as output:
structure(list(values = c("P", "C", "C", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", ""), ind = c("Parent",
"Child", "Child", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), output = c("Parent", "Child", "Child", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "")), row.names = c(NA,
-19L), class = "data.frame")
With dplyr, if you do a right_join(x, y) then the result will include a subset of the matched rows for x, then unmatched rows for y.
From R documentation on mutating joins, the value returned will be:
An object of the same type as x. The order of the rows and columns of
x is preserved as much as possible. The output has the following
properties:
For inner_join(), a subset of x rows. For left_join(), all x rows. For
right_join(), a subset of x rows, followed by unmatched y rows. For
full_join(), all x rows, followed by unmatched y rows.
That is why you have the 3 matched rows at the beginning of your resulting data.frame.
To get the desired result preserving the row order of df, try a left_join as follows:
# Build the key/label lookup once; left_join() preserves df's row order
# (unlike right_join(), which puts the matched rows first).
df2 <- stack(master) %>% type.convert(as.is = TRUE)
df %>%
  left_join(df2, by = c("column" = "values")) %>%
  mutate(output = coalesce(ind, column))  # fall back to the raw code when unmatched
Output
column ind output
1 <NA>
2 <NA>
3 <NA>
4 <NA>
5 <NA>
6 <NA>
7 <NA>
8 <NA>
9 <NA>
10 <NA>
11 <NA>
12 <NA>
13 <NA>
14 <NA>
15 <NA>
16 <NA>
17 P Parent Parent
18 C Child Child
19 C Child Child

How to read csv file for text mining

I will be using tm for text mining purposes. However, my CSV file is weird. Below is the dput output after I read it with the read.table function in R. There are three columns: lie, sentiment, and review. However, the fourth column contains review text with no column name. I am new to R and text mining. If I use read.csv, I get an error. Please suggest a better approach for reading the CSV file.
Update:
> dput(head(df))
structure(list(V1 = c("lie,sentiment,review", "f,n,'Mike\\'s",
"f,n,'i", "f,n,'After", "f,n,'Olive", "f,n,'I"), V2 = c("", "Pizza",
"really", "I", "Oil", "went"), V3 = c("", "High", "like", "went",
"Garden", "to"), V4 = c("", "Point,", "this", "shopping", "was",
"the"), V5 = c("", "NY", "buffet", "with", "very", "Chilis"),
V6 = c("", "Service", "restaurant", "some", "disappointing.",
"on"), V7 = c("", "was", "in", "of", "I", "Erie"), V8 = c("",
"very", "Marshall", "my", "expect", "Blvd"), V9 = c("", "slow",
"street.", "friend,", "good", "and"), V10 = c("", "and",
"they", "we", "food", "had"), V11 = c("", "the", "have",
"went", "and", "the"), V12 = c("", "quality", "a", "to",
"good", "worst"), V13 = c("", "was", "lot", "DODO", "service",
"meal"), V14 = c("", "low.", "of", "restaurant", "(at", "of"
), V15 = c("", "You", "selection", "for", "least!!)", "my"
), V16 = c("", "would", "of", "dinner.", "when", "life."),
V17 = c("", "think", "american,", "I", "I", "We"), V18 = c("",
"they", "japanese,", "found", "go", "arrived"), V19 = c("",
"would", "and", "worm", "out", "and"), V20 = c("", "know",
"chinese", "in", "to", "waited"), V21 = c("", "at", "dishes.",
"one", "eat.", "5"), V22 = c("", "least", "we", "of", "The",
"minutes"), V23 = c("", "how", "also", "the", "meal", "for"
), V24 = c("", "to", "got", "dishes", "was", "a"), V25 = c("",
"make", "a", ".'", "cold", "hostess,"), V26 = c("", "good",
"free", "", "when", "and"), V27 = c("", "pizza,", "drink",
"", "we", "then"), V28 = c("", "not.", "and", "", "got",
"were"), V29 = c("", "Stick", "free", "", "it,", "seated"
), V30 = c("", "to", "refill.", "", "and", "by"), V31 = c("",
"pre-made", "there", "", "the", "a"), V32 = c("", "dishes",
"are", "", "waitor", "waiter"), V33 = c("", "like", "also",
"", "had", "who"), V34 = c("", "stuffed", "different", "",
"no", "was"), V35 = c("", "pasta", "kinds", "", "manners",
"obviously"), V36 = c("", "or", "of", "", "whatsoever.",
"in"), V37 = c("", "a", "dessert.", "", "Don\\'t", "a"),
V38 = c("", "salad.", "the", "", "go", "terrible"), V39 = c("",
"You", "staff", "", "to", "mood."), V40 = c("", "should",
"is", "", "the", "We"), V41 = c("", "consider", "very", "",
"Olive", "order"), V42 = c("", "dining", "friendly.", "",
"Oil", "drinks"), V43 = c("", "else", "it", "", "Garden.",
"and"), V44 = c("", "where.'", "is", "", "\nf,n,", "it"),
V45 = c("", "", "also", "", "The", "took"), V46 = c("", "",
"quite", "", "Seven", "them"), V47 = c("", "", "cheap", "",
"Heaven", "15"), V48 = c("", "", "compared", "", "restaurant",
"minutes"), V49 = c("", "", "with", "", "was", "to"), V50 = c("",
"", "the", "", "never", "bring"), V51 = c("", "", "other",
"", "known", "us"), V52 = c("", "", "restaurant", "", "for",
"both"), V53 = c("", "", "in", "", "a", "the"), V54 = c("",
"", "syracuse", "", "superior", "wrong"), V55 = c("", "",
"area.", "", "service", "beers"), V56 = c("", "", "i", "",
"but", "which"), V57 = c("", "", "will", "", "what", "were"
), V58 = c("", "", "definitely", "", "we", "barely"), V59 = c("",
"", "coming", "", "experienced", "cold."), V60 = c("", "",
"back", "", "last", "Then"), V61 = c("", "", "here.'", "",
"week", "we"), V62 = c("", "", "", "", "was", "order"), V63 = c("",
"", "", "", "a", "an"), V64 = c("", "", "", "", "disaster.",
"appetizer"), V65 = c("", "", "", "", "The", "and"), V66 = c("",
"", "", "", "waiter", "wait"), V67 = c("", "", "", "", "would",
"25"), V68 = c("", "", "", "", "not", "minutes"), V69 = c("",
"", "", "", "notice", "for"), V70 = c("", "", "", "", "us",
"cold"), V71 = c("", "", "", "", "until", "southwest"), V72 = c("",
"", "", "", "we", "egg"), V73 = c("", "", "", "", "asked",
"rolls,"), V74 = c("", "", "", "", "him", "at"), V75 = c("",
"", "", "", "4", "which"), V76 = c("", "", "", "", "times",
"point"), V77 = c("", "", "", "", "to", "we"), V78 = c("",
"", "", "", "bring", "just"), V79 = c("", "", "", "", "us",
"paid"), V80 = c("", "", "", "", "the", "and"), V81 = c("",
"", "", "", "menu.", "left."), V82 = c("", "", "", "", "The",
"Don\\'t"), V83 = c("", "", "", "", "food", "go.'"), V84 = c("",
"", "", "", "was", ""), V85 = c("", "", "", "", "not", ""
), V86 = c("", "", "", "", "exceptional", ""), V87 = c("",
"", "", "", "either.", ""), V88 = c("", "", "", "", "It",
""), V89 = c("", "", "", "", "took", ""), V90 = c("", "",
"", "", "them", ""), V91 = c("", "", "", "", "though", ""
), V92 = c("", "", "", "", "2", ""), V93 = c("", "", "",
"", "minutes", ""), V94 = c("", "", "", "", "to", ""), V95 = c("",
"", "", "", "bring", ""), V96 = c("", "", "", "", "us", ""
), V97 = c("", "", "", "", "a", ""), V98 = c("", "", "",
"", "check", ""), V99 = c("", "", "", "", "after", ""), V100 = c("",
"", "", "", "they", ""), V101 = c("", "", "", "", "spotted",
""), V102 = c("", "", "", "", "we", ""), V103 = c("", "",
"", "", "finished", ""), V104 = c("", "", "", "", "eating",
""), V105 = c("", "", "", "", "and", ""), V106 = c("", "",
"", "", "are", ""), V107 = c("", "", "", "", "not", ""),
V108 = c("", "", "", "", "ordering", ""), V109 = c("", "",
"", "", "more.", ""), V110 = c("", "", "", "", "Well,", ""
), V111 = c("", "", "", "", "never", ""), V112 = c("", "",
"", "", "more.", ""), V113 = c("", "", "", "", "\nf,n,",
""), V114 = c("", "", "", "", "I", ""), V115 = c("", "",
"", "", "went", ""), V116 = c("", "", "", "", "to", ""),
V117 = c("", "", "", "", "XYZ", ""), V118 = c("", "", "",
"", "restaurant", ""), V119 = c("", "", "", "", "and", ""
), V120 = c("", "", "", "", "had", ""), V121 = c("", "",
"", "", "a", ""), V122 = c("", "", "", "", "terrible", ""
), V123 = c("", "", "", "", "experience.", ""), V124 = c("",
"", "", "", "I", ""), V125 = c("", "", "", "", "had", ""),
V126 = c("", "", "", "", "a", ""), V127 = c("", "", "", "",
"YELP", ""), V128 = c("", "", "", "", "Free", ""), V129 = c("",
"", "", "", "Appetizer", ""), V130 = c("", "", "", "", "coupon",
""), V131 = c("", "", "", "", "which", ""), V132 = c("",
"", "", "", "could", ""), V133 = c("", "", "", "", "be",
""), V134 = c("", "", "", "", "applied", ""), V135 = c("",
"", "", "", "upon", ""), V136 = c("", "", "", "", "checking",
""), V137 = c("", "", "", "", "in", ""), V138 = c("", "",
"", "", "to", ""), V139 = c("", "", "", "", "the", ""), V140 = c("",
"", "", "", "restaurant.", ""), V141 = c("", "", "", "",
"The", ""), V142 = c("", "", "", "", "person", ""), V143 = c("",
"", "", "", "serving", ""), V144 = c("", "", "", "", "us",
""), V145 = c("", "", "", "", "was", ""), V146 = c("", "",
"", "", "very", ""), V147 = c("", "", "", "", "rude", ""),
V148 = c("", "", "", "", "and", ""), V149 = c("", "", "",
"", "didn\\'t", ""), V150 = c("", "", "", "", "acknowledge",
""), V151 = c("", "", "", "", "the", ""), V152 = c("", "",
"", "", "coupon.", ""), V153 = c("", "", "", "", "When",
""), V154 = c("", "", "", "", "I", ""), V155 = c("", "",
"", "", "asked", ""), V156 = c("", "", "", "", "her", ""),
V157 = c("", "", "", "", "about", ""), V158 = c("", "", "",
"", "it,", ""), V159 = c("", "", "", "", "she", ""), V160 = c("",
"", "", "", "rudely", ""), V161 = c("", "", "", "", "replied",
""), V162 = c("", "", "", "", "back", ""), V163 = c("", "",
"", "", "saying", ""), V164 = c("", "", "", "", "she", ""
), V165 = c("", "", "", "", "had", ""), V166 = c("", "",
"", "", "already", ""), V167 = c("", "", "", "", "applied",
""), V168 = c("", "", "", "", "it.", ""), V169 = c("", "",
"", "", "Then", ""), V170 = c("", "", "", "", "I", ""), V171 = c("",
"", "", "", "inquired", ""), V172 = c("", "", "", "", "about",
""), V173 = c("", "", "", "", "the", ""), V174 = c("", "",
"", "", "free", ""), V175 = c("", "", "", "", "salad", ""
), V176 = c("", "", "", "", "that", ""), V177 = c("", "",
"", "", "they", ""), V178 = c("", "", "", "", "serve.", ""
), V179 = c("", "", "", "", "She", ""), V180 = c("", "",
"", "", "rudely", ""), V181 = c("", "", "", "", "said", ""
), V182 = c("", "", "", "", "that", ""), V183 = c("", "",
"", "", "you", ""), V184 = c("", "", "", "", "have", ""),
V185 = c("", "", "", "", "to", ""), V186 = c("", "", "",
"", "order", ""), V187 = c("", "", "", "", "the", ""), V188 = c("",
"", "", "", "main", ""), V189 = c("", "", "", "", "course",
""), V190 = c("", "", "", "", "to", ""), V191 = c("", "",
"", "", "get", ""), V192 = c("", "", "", "", "that.", ""),
V193 = c("", "", "", "", "Overall,", ""), V194 = c("", "",
"", "", "I", ""), V195 = c("", "", "", "", "had", ""), V196 = c("",
"", "", "", "a", ""), V197 = c("", "", "", "", "bad", ""),
V198 = c("", "", "", "", "experience", ""), V199 = c("",
"", "", "", "as", ""), V200 = c("", "", "", "", "I", ""),
V201 = c("", "", "", "", "had", ""), V202 = c("", "", "",
"", "taken", ""), V203 = c("", "", "", "", "my", ""), V204 = c("",
"", "", "", "family", ""), V205 = c("", "", "", "", "to",
""), V206 = c("", "", "", "", "that", ""), V207 = c("", "",
"", "", "restaurant", ""), V208 = c("", "", "", "", "for",
""), V209 = c("", "", "", "", "the", ""), V210 = c("", "",
"", "", "first", ""), V211 = c("", "", "", "", "time", ""
), V212 = c("", "", "", "", "and", ""), V213 = c("", "",
"", "", "I", ""), V214 = c("", "", "", "", "had", ""), V215 = c("",
"", "", "", "high", ""), V216 = c("", "", "", "", "hopes",
""), V217 = c("", "", "", "", "from", ""), V218 = c("", "",
"", "", "the", ""), V219 = c("", "", "", "", "restaurant",
""), V220 = c("", "", "", "", "which", ""), V221 = c("",
"", "", "", "is,", ""), V222 = c("", "", "", "", "otherwise,",
""), V223 = c("", "", "", "", "my", ""), V224 = c("", "",
"", "", "favorite", ""), V225 = c("", "", "", "", "place",
""), V226 = c("", "", "", "", "to", ""), V227 = c("", "",
"", "", "dine.", ""), V228 = c("", "", "", "", "\nf,n,",
""), V229 = c("", "", "", "", "I", ""), V230 = c("", "",
"", "", "went", ""), V231 = c("", "", "", "", "to", ""),
V232 = c("", "", "", "", "ABC", ""), V233 = c("", "", "",
"", "restaurant", ""), V234 = c("", "", "", "", "two", ""
), V235 = c("", "", "", "", "days", ""), V236 = c("", "",
"", "", "ago", ""), V237 = c("", "", "", "", "and", ""),
V238 = c("", "", "", "", "I", ""), V239 = c("", "", "", "",
"hated", ""), V240 = c("", "", "", "", "the", ""), V241 = c("",
"", "", "", "food", ""), V242 = c("", "", "", "", "and",
""), V243 = c("", "", "", "", "the", ""), V244 = c("", "",
"", "", "service.", ""), V245 = c("", "", "", "", "We", ""
), V246 = c("", "", "", "", "were", ""), V247 = c("", "",
"", "", "kept", ""), V248 = c("", "", "", "", "waiting",
""), V249 = c("", "", "", "", "for", ""), V250 = c("", "",
"", "", "over", ""), V251 = c("", "", "", "", "an", ""),
V252 = c("", "", "", "", "hour", ""), V253 = c("", "", "",
"", "just", ""), V254 = c("", "", "", "", "to", ""), V255 = c("",
"", "", "", "get", ""), V256 = c("", "", "", "", "seated",
""), V257 = c("", "", "", "", "and", ""), V258 = c("", "",
"", "", "once", ""), V259 = c("", "", "", "", "we", ""),
V260 = c("", "", "", "", "ordered,", ""), V261 = c("", "",
"", "", "our", ""), V262 = c("", "", "", "", "food", ""),
V263 = c("", "", "", "", "came", ""), V264 = c("", "", "",
"", "out", ""), V265 = c("", "", "", "", "cold.", ""), V266 = c("",
"", "", "", "I", ""), V267 = c("", "", "", "", "ordered",
""), V268 = c("", "", "", "", "the", ""), V269 = c("", "",
"", "", "pasta", ""), V270 = c("", "", "", "", "and", ""),
V271 = c("", "", "", "", "it", ""), V272 = c("", "", "",
"", "was", ""), V273 = c("", "", "", "", "terrible", ""),
V274 = c("", "", "", "", "-", ""), V275 = c("", "", "", "",
"completely", ""), V276 = c("", "", "", "", "bland", ""),
V277 = c("", "", "", "", "and", ""), V278 = c("", "", "",
"", "very", ""), V279 = c("", "", "", "", "unappatizing.",
""), V280 = c("", "", "", "", "I", ""), V281 = c("", "",
"", "", "definitely", ""), V282 = c("", "", "", "", "would",
""), V283 = c("", "", "", "", "not", ""), V284 = c("", "",
"", "", "recommend", ""), V285 = c("", "", "", "", "going",
""), V286 = c("", "", "", "", "there,", ""), V287 = c("",
"", "", "", "especially", ""), V288 = c("", "", "", "", "if",
""), V289 = c("", "", "", "", "you\\'re", ""), V290 = c("",
"", "", "", "in", ""), V291 = c("", "", "", "", "a", ""),
V292 = c("", "", "", "", "hurry!'", "")), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11",
"V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20",
"V21", "V22", "V23", "V24", "V25", "V26", "V27", "V28", "V29",
"V30", "V31", "V32", "V33", "V34", "V35", "V36", "V37", "V38",
"V39", "V40", "V41", "V42", "V43", "V44", "V45", "V46", "V47",
"V48", "V49", "V50", "V51", "V52", "V53", "V54", "V55", "V56",
"V57", "V58", "V59", "V60", "V61", "V62", "V63", "V64", "V65",
"V66", "V67", "V68", "V69", "V70", "V71", "V72", "V73", "V74",
"V75", "V76", "V77", "V78", "V79", "V80", "V81", "V82", "V83",
"V84", "V85", "V86", "V87", "V88", "V89", "V90", "V91", "V92",
"V93", "V94", "V95", "V96", "V97", "V98", "V99", "V100", "V101",
"V102", "V103", "V104", "V105", "V106", "V107", "V108", "V109",
"V110", "V111", "V112", "V113", "V114", "V115", "V116", "V117",
"V118", "V119", "V120", "V121", "V122", "V123", "V124", "V125",
"V126", "V127", "V128", "V129", "V130", "V131", "V132", "V133",
"V134", "V135", "V136", "V137", "V138", "V139", "V140", "V141",
"V142", "V143", "V144", "V145", "V146", "V147", "V148", "V149",
"V150", "V151", "V152", "V153", "V154", "V155", "V156", "V157",
"V158", "V159", "V160", "V161", "V162", "V163", "V164", "V165",
"V166", "V167", "V168", "V169", "V170", "V171", "V172", "V173",
"V174", "V175", "V176", "V177", "V178", "V179", "V180", "V181",
"V182", "V183", "V184", "V185", "V186", "V187", "V188", "V189",
"V190", "V191", "V192", "V193", "V194", "V195", "V196", "V197",
"V198", "V199", "V200", "V201", "V202", "V203", "V204", "V205",
"V206", "V207", "V208", "V209", "V210", "V211", "V212", "V213",
"V214", "V215", "V216", "V217", "V218", "V219", "V220", "V221",
"V222", "V223", "V224", "V225", "V226", "V227", "V228", "V229",
"V230", "V231", "V232", "V233", "V234", "V235", "V236", "V237",
"V238", "V239", "V240", "V241", "V242", "V243", "V244", "V245",
"V246", "V247", "V248", "V249", "V250", "V251", "V252", "V253",
"V254", "V255", "V256", "V257", "V258", "V259", "V260", "V261",
"V262", "V263", "V264", "V265", "V266", "V267", "V268", "V269",
"V270", "V271", "V272", "V273", "V274", "V275", "V276", "V277",
"V278", "V279", "V280", "V281", "V282", "V283", "V284", "V285",
"V286", "V287", "V288", "V289", "V290", "V291", "V292"), row.names = c(NA,
6L), class = "data.frame")
Dataset:
lie sentiment review
f n 'Mike\'s Pizza High Point NY Service was very slow and the quality was low. You would think they would know at least how to make good pizza not. Stick to pre-made dishes like stuffed pasta or a salad. You should consider dining else where.'
f n 'i really like this buffet restaurant in Marshall street. they have a lot of selection of american japanese and chinese dishes. we also got a free drink and free refill. there are also different kinds of dessert. the staff is very friendly. it is also quite cheap compared with the other restaurant in syracuse area. i will definitely coming back here.'
f n 'After I went shopping with some of my friend we went to DODO restaurant for dinner. I found worm in one of the dishes .'
f n 'Olive Oil Garden was very disappointing. I expect good food and good service (at least!!) when I go out to eat. The meal was cold when we got it and the waitor had no manners whatsoever. Don\'t go to the Olive Oil Garden. '
f n 'The Seven Heaven restaurant was never known for a superior service but what we experienced last week was a disaster. The waiter would not notice us until we asked him 4 times to bring us the menu. The food was not exceptional either. It took them though 2 minutes to bring us a check after they spotted we finished eating and are not ordering more. Well never more. '
f n 'I went to XYZ restaurant and had a terrible experience. I had a YELP Free Appetizer coupon which could be applied upon checking in to the restaurant. The person serving us was very rude and didn\'t acknowledge the coupon. When I asked her about it she rudely replied back saying she had already applied it. Then I inquired about the free salad that they serve. She rudely said that you have to order the main course to get that. Overall I had a bad experience as I had taken my family to that restaurant for the first time and I had high hopes from the restaurant which is otherwise my favorite place to dine. '
f n 'I went to ABC restaurant two days ago and I hated the food and the service. We were kept waiting for over an hour just to get seated and once we ordered our food came out cold. I ordered the pasta and it was terrible - completely bland and very unappatizing. I definitely would not recommend going there especially if you\'re in a hurry!'
f n 'I went to the Chilis on Erie Blvd and had the worst meal of my life. We arrived and waited 5 minutes for a hostess and then were seated by a waiter who was obviously in a terrible mood. We order drinks and it took them 15 minutes to bring us both the wrong beers which were barely cold. Then we order an appetizer and wait 25 minutes for cold southwest egg rolls at which point we just paid and left. Don\'t go.'
f n 'OMG. This restaurant is horrible. The receptionist did not greet us we just stood there and waited for five minutes. The food came late and served not warm. Me and my pet ordered a bowl of salad and a cheese pizza. The salad was not fresh the crust of a pizza was so hard like plastics. My dog didn\'t even eat that pizza. I hate this place!!!!!!!!!!'
Thanks in advance,
I don't know why you removed the file from the original post, @Yes Boss, but this answer is based on that file rather than on your dput output. The file had two problems that prevented you from reading it in: 1. your quote character was ' instead of the more common "; 2. ' is also used (escaped) inside the column review, which is a bit too much for base R (it tries to split into new columns at those points). Luckily, the package data.table is a bit smarter and can take care of problem #2:
# data.table's fread() handles embedded escaped quotes inside quoted fields,
# which base read.table() chokes on for this file.
library(data.table)

# quote = "'" tells fread the fields are delimited by single quotes
# (the file uses ' rather than the more common ").
df <- fread(file = "deception.csv", quote = "'")
The resulting object will be a data.table (which also inherits from data.frame):
> str(df)
Classes ‘data.table’ and 'data.frame': 92 obs. of 3 variables:
$ lie : chr "f" "f" "f" "f" ...
$ sentiment: chr "n" "n" "n" "n" ...
$ review : chr "Mike\\'s Pizza High Point, NY Service was very slow and the quality was low. You would think they would know at"| __truncated__ "i really like this buffet restaurant in Marshall street. they have a lot of selection of american, japanese, an"| __truncated__ "After I went shopping with some of my friend, we went to DODO restaurant for dinner. I found worm in one of the dishes ." "Olive Oil Garden was very disappointing. I expect good food and good service (at least!!) when I go out to eat."| __truncated__ ...
- attr(*, ".internal.selfref")=<externalptr>
You can turn this behaviour off by setting data.table = FALSE in fread() (if you want to, I recommend learning how to work with data.table).
A personal opinionated note: If you want to get into text mining, look into the quanteda package instead of tm. It is a lot faster and has a more modern approach to many tasks.
For this particular text file, you need to look at the quote argument. In read.table(), the default quote argument is either a single or double quote. Here you need to make it just a single quote:
# Restrict the quote character to the single quote only: the file delimits
# the review column with ', so the default quote set (both " and ') is not
# what we want here.
df <- read.table("filename", header = TRUE, quote = "'")

# Inspect the structure to confirm the three columns were parsed correctly.
str(df)
# 'data.frame': 9 obs. of 3 variables:
# $ lie : Factor w/ 1 level "f": 1 1 1 1 1 1 1 1 1
# $ sentiment: Factor w/ 1 level "n": 1 1 1 1 1 1 1 1 1
# $ review : Factor w/ 9 levels "After I went shopping with some of my friend we went to DODO restaurant for dinner. I found worm in one of the dishes .",..: 6 2 1 7 9 5 3 4 8
That should do it for you.
I'd recommend reading the help file for read.table() (all the way through). There's a lot to consider.

Resources