lundi 26 juillet 2021

Summarise & Aggregate: I can´t understand why I get this error, if statement doesn't work

I am trying to replicate the way SAS calculates means with PROC SURVEYMEANS, for this, SAS treats numerical variables in a different way than categorical variables. The means of the numerical variables are weighted means and the means of the categorical variables are made with proportions of the value of the weights.

I have managed to get the same results as in SAS for both types of variables. But for a domain type (together the data according to certain types of variables) I get an error for categorical variables when the number of observations is high.

In the complete data, there are many variables with many NAs, at no time it gives me problems, but at that level of aggregation, I think NAs are the problem since very small groups are created (sometimes of 1 individual) and I think that I can't get around the error of dividing by 0 for some reason. (I am really speculating because I have not been able to find the error and in other domains I also have entire NA groups and they do not give problems)

Here I show a small example of what my code does when it works well. I'm going to omit the numeric variables:

INPUT:

CODE   TIPO   P1G  P3  Weights
--------------------------------
3441  Grado_H  3   0    1.2
3441  Grado_H  2   1    1.2
3431  Grado_H  2   1    2 
3431  Grado_H  5   1    2
3465  Grado_J  3   0    1.9
3465  Grado_J  NA  1    1.9
4061  Grado_X  8   0    1.43

OUTPUT: An excel file with as many sheets as domains I am evaluating, in each of the sheets, I concatenate the results of all the variables in that domain:

              SHEET1: CODE                                          SHEET2: TIPO
CODE    Var VarLevel  n     Mean  TIPE_VAR          TIPO    Var VarLevel    n   Mean    TIPE_VAR
------------------------------------------         ---------------------------------------------
3431    P1G    2      1     0.5     CAT            Grado_H  P1G    2        2   0.5       CAT
3431    P1G    5      1     0.5     CAT            Grado_H  P1G    3        1   0.1875    CAT
3441    P1G    2      1     0.5     CAT            Grado_H  P1G    5        1   0.3125    CAT
3441    P1G    3      1     0.5     CAT            Grado_J  P1G    3        1   1         CAT
3465    P1G    3      1     1       CAT            Grado_X  P1G    8        1   1         CAT
4061    P1G    8      1     1       CAT            Grado_H  P3     2        1   0.5       CAT
3431    P3     2      2     0.5     CAT            Grado_H  P3     3        3   0.1875    CAT
3431    P3     5      1     0.5     CAT            Grado_H  P3     5        1   0.3125    CAT
3441    P3     2      1     0.5     CAT            Grado_J  P3     3        1   1         CAT
3441    P3     3      1     0.5     CAT            Grado_X  P3     8        1   1         CAT
3465    P3     3      1     1       CAT            Grado_H  P3     0        1   0.1875    CAT
4061    P3     8      1     1       CAT            Grado_H  P3     1        3   0.8125    CAT
3431    P3     1      2     1       CAT            Grado_J  P3     0        1   0.5       CAT
3441    P3     0      1     0.5     CAT            Grado_J  P3     1        1   0.5       CAT
3441    P3     1      1     0.5     CAT            Grado_X  P3     0        1   1         CAT
3465    P3     0      1     0.5     CAT                             
3465    P3     1      1     0.5     CAT                             
4061    P3     0      1     1       CAT

And this is the code:

datos<-structure(list(CODE = c(3441, 3441, 3431, 3431, 3465, 3465, 4061), 
                      TIPO = c("Grado_H", "Grado_H", "Grado_H", "Grado_H", "Grado_J", "Grado_J", "Grado_X"), 
                      P1G = c(3, 2, 2, 5, 3, NA, 8), 
                      P3 = c(0, 1, 1, 1, 0, 1, 0), 
                      Pesos = c(1.2, 1.2, 2, 2, 1.9, 1.9, 1.43)), 
                      row.names = c(NA, -7L), class = c("tbl_df", "tbl","data.frame"))
domain<-c("CODE","TIPO")
preg_c<-c("P1G","P3")
library(readxl)
library(dplyr)

wb <- createWorkbook()
for(k in 1:(length(domain))){
  data_c<-subset(datos[!(is.na(datos[[domain[k]]])),],select=c(domain[[k]],preg_c,"Pesos"))
  
  #===Categorical Means====
  DatosTotal <- data.frame()
  for(j in 1:(length(preg_c))){
    final2<-summarise(group_by(data_c[!(is.na(data_c[[preg_c[j]]])),], data_c[!(is.na(data_c[[preg_c[j]]])),][[domain[[k]]]],data_c[!(is.na(data_c[[preg_c[j]]])),][[preg_c[j]]]),n = n())
    for(i in 1:length(unique(data_c[[domain[[k]]]]))){
      datosH<-data_c[data_c[[domain[[k]]]]==unique(sort(data_c[[domain[[k]]]]))[i],]
      sumas<- aggregate(x = datosH$Pesos,by = list(datosH[[preg_c[j]]]),FUN = sum) 
      suma_total<-sum(datosH[!is.na(datosH[[preg_c[j]]]),]$Pesos)
      if(suma_total==0){break}
      if(exists("datosH2")){
        datosH3<-unique(sort(datosH[[preg_c[j]]]))
        datosH_Means<-cbind(sumas$x/suma_total)
        datosH3<-cbind(unique(sort(data_c[[domain[[k]]]]))[i],datosH3,datosH_Means)
        datosH2<-rbind(datosH2,datosH3)
      }else{
        datosH2<-c()
        datosH2<-unique(sort(datosH[[preg_c[j]]]))
        datosH_Means<-cbind(sumas$x/suma_total)
        datosH2<-cbind(unique(sort(data_c[[domain[[k]]]]))[i],datosH2,datosH_Means)}
    }
    categorica<-as.data.frame(cbind(datosH2[,1],preg_c[j],datosH2[,2],final2$n,datosH2[,3],"CAT"))
    colnames(categorica)<-c(domain[[k]],"Var","VarLevel","n","Mean","TIPE_VAR")
    DatosTotal<-rbind(DatosTotal,categorica)

  }
  rm(final2,datosH,sumas,suma_total,datosH3,datosH_Means,categorica,datosH2)

  addWorksheet(wb, domain[[k]])
  writeDataTable(wb, domain[[k]], DatosTotal)
  freezePane(wb,domain[[k]], firstRow = TRUE)
  setColWidths(wb,domain[[k]],cols = 1:ncol(DatosTotal) ,widths = "auto")
}
saveWorkbook(wb, file = "C:/Users/Cris/Desktop/Practicas/Output.xlsx",overwrite = TRUE)

As you can see, it works, but if I try with this piece of the real dataset I get the following error:

datos <- structure(list(CODE = c(3491, 3491, 3491, 3491, 3741,4161, 4051, 3741, 4141, 3471, 3824, 4271, 3603, 3471, 3491, 3603,3473, 3603, 3464, 3491, 3771, 3464, 4301, 3464, 3474, 3463, 3381,3473, 3464, 3603, 3475, 4563, 4563, 4563, 4563, 4563, 4563, 4563,4563, 3474), TIPO = c("MASTER_J", "MASTER_J", "MASTER_J","MASTER_J", "GRADO_J", "MASTER_J", "GRADO_X", "GRADO_J", "MASTER_J","GRADO_J", "GRADO_J", "MASTER_S", "GRADO_J", "GRADO_J", "MASTER_J","GRADO_J", "GRADO_J", "GRADO_J", "GRADO_J", "MASTER_J", "GRADO_T","GRADO_J", "MASTER_J", "GRADO_J", "GRADO_J", "GRADO_J", "MASTER_S","GRADO_J", "GRADO_J", "GRADO_J", "GRADO_J", "MASTER_J", "MASTER_J","MASTER_J", "MASTER_J", "MASTER_J", "MASTER_J", "MASTER_J", "MASTER_J","GRADO_J"), P1G = c(NA, NA, NA, NA, "7", NA,"1", "2", NA, "2", "3", NA, "3", "2", NA, "3", "10", "3", "3",NA, "1", "3", NA, "3", "8", "1", NA, "3", "8", "3", "2", NA,NA, NA, NA, NA, NA, NA, NA, "3"), P3 = c(11, 2, 2, 2, 1, 1, 2,2, 2, 6, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 6, 6, 2, 2, 2, 2, 2), Pesos = c(1.6, 1.6, 1.6,1.6, 1.57142857142857, 1.33333333333333, 1.54285714285714, 1.57142857142857,1.6, 1.42201834862385, 1.5, 1.27272727272727, 1.48387096774194,1.42201834862385, 1.6, 1.48387096774194, 1.73469387755102, 1.48387096774194,1.22916666666667, 1.6, 1.4, 1.22916666666667, 1.73076923076923,1.22916666666667, 1.37837837837838, 1.62068965517241, 1.21428571428571,1.73469387755102, 1.22916666666667, 1.48387096774194, 1.63157894736842,4.25, 4.25, 4.25, 4.25, 4.25, 4.25, 4.25, 4.25, 1.37837837837838)), row.names = c(NA, -40L), class = c("tbl_df", "tbl", "data.frame")) 

Error in cbind(datosH2[, 1], preg_c[j], datosH2[, 2], final2$n, datosH2[, : object 'datosH2' not found

My code is perhaps very little compact and there is a way to do the same in a more efficient way so that this error does not appear. What could be happening? It's like if the "if" with the "break" doesn't work, I would appreciate if someone could explain to me what I have done wrong.

Aucun commentaire:

Enregistrer un commentaire