Data Wrangling Project


Data structure


Hierarchical Community Network (hiconet) is software for integrating multiple data types collected from a common group of subjects. We aim to construct data, including data matrices and annotation, that can be used as input to hiconet and serve as a reference for users.

Data Matrix

A data matrix of continuous values representing a biological state or concentration, all of the same data type, possibly spanning different time points or treatments.
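As a hypothetical illustration (study IDs, observation IDs, and values are made up), the layout produced later in this post has one row per feature and one column per observation:

library(tibble)

# Rows = features (here, reported cell populations); columns = observation IDs
data_matrix_example <- tribble(
  ~population_definition_reported, ~`SUB100_0.0`, ~`SUB100_7.0`, ~`SUB101_0.0`,
  "B cells",                        1200,          1510,           980,
  "Monocytes",                      3400,          2950,          3120
)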

Annotation

  • Observation Annotation: metadata on the samples (example below).
  • Feature Annotation: metadata on the features.
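For example, an observation annotation table might look like the following sketch (IDs and values are hypothetical; observation_ID matches the column names of the data matrix):

library(tibble)

# One row per sample/observation
observation_annotation_example <- tribble(
  ~observation_ID, ~participant_id, ~study_time_collected,
  "SUB100_0.0",    "SUB100.80",     0,
  "SUB100_7.0",    "SUB100.80",     7
)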

Key annotation variables

time point and treatment: values are summarized per sample group defined by these key annotation variables.
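A minimal sketch of that idea, using a hypothetical toy table (the treatment column name and all values are made up for illustration):

library(dplyr)
library(tibble)

toy <- tibble(
  study_time_collected   = c(0, 0, 7, 7, 0, 0, 7, 7),
  treatment              = c("placebo", "placebo", "placebo", "placebo",
                             "vaccine", "vaccine", "vaccine", "vaccine"),
  population_cell_number = c(95, 100, 410, 395, 120, 130, 340, 360)
)

# Summarize the measured value per (time point, treatment) sample group
toy %>%
  group_by(study_time_collected, treatment) %>%
  summarize(med = median(population_cell_number), .groups = "drop")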

Graph

graph/network describing relationships in the data (e.g. as used in the loom format, loompy.org).

Data Wrangling

  1. Connect to ImmPort using ImmuneSpaceR
library(ImmuneSpaceR)

# Studies to pull fcs_analyzed_result data from
names <- c("SDY80","SDY180","SDY212","SDY269","SDY312")
flist <- list()

for (ii in seq_along(names)) {
  tmp <- CreateConnection(names[ii])
  flist[[ii]] <- tmp$getDataset("fcs_analyzed_result")
}
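Before writing anything out, a quick optional check that each study actually returned rows (some datasets can be empty, which the next step also guards against):

# Number of fcs_analyzed_result rows returned per study
setNames(sapply(flist, nrow), names)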
  2. Write out the input files
library(readr)

mainDir <- "~/Downloads"
subDir <- "out"
key <- "fcs"   # short tag used in the output file names (assumed; any label works)

dir.create(file.path(mainDir, subDir), showWarnings = FALSE)
setwd(file.path(mainDir, subDir))

for (ii in seq_along(flist)) {
  # Write out one TSV per study
  outname <- paste0(paste(names[ii], key, sep = "_"), ".tsv")

  if (nrow(flist[[ii]]) == 0) {
    print(paste(outname, "does not have any rows to write"))
  } else {
    write_tsv(flist[[ii]], outname)
  }
}
  3. Generate the data matrix

col1 is the column we group by, which differs between datasets.

col2 is the column we wish to summarize, which also differs between datasets.
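The loop below assumes col1 and col2 are already defined; going by the comments in the code, for fcs_analyzed_result they would be something like:

# Assumed column choices for fcs_analyzed_result (adjust per dataset)
col1 <- "population_definition_reported"  # grouping column (the features)
col2 <- "population_cell_number"          # value column to summarize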

library(dplyr)
library(tidyr)

setwd(file.path(mainDir, subDir))

files <- list.files(pattern = paste0(key, "\\.tsv$"))  # only the raw per-study files

for (ii in seq_along(files)) {
  fcs <- read_tsv(files[ii])

  # observation_ID = participant (study suffix stripped) + time point
  ids <- sapply(strsplit(fcs$`participant_id`, "\\."), `[`, 1)
  observation_ID <- gsub(" ", "", paste(ids, format(fcs$`study_time_collected`, nsmall = 1), sep = "_"))

  # Prepend the observation_ID column
  fcs.mat <- cbind(observation_ID, fcs)
  fcs.mat$observation_ID <- as.character(fcs.mat$observation_ID)
  fcs.mat$`Participant ID` <- ids

  # Summarize col2 per (col1, observation_ID), then spread observations into columns
  data.matrix <- fcs.mat %>%
    group_by(UQ(as.name(col1)), observation_ID) %>%  # e.g. `population_definition_reported`
    summarize(med = median(UQ(as.name(col2)))) %>%   # e.g. `population_cell_number`
    spread(observation_ID, med)

  fname <- paste(strsplit(files[ii], "_")[[1]][1], "fcs_data_matrix.tsv", sep = "_")
  write_tsv(data.matrix, fname)
}
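Each resulting matrix has one row per value of col1 (e.g. reported cell population) and one column per observation_ID (participant plus time point), which is the data-matrix layout described above.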
  4. Generate the annotation file

We limit the output to the columns we need and remove any duplicate rows, since this is simply an annotation file.


setwd(file.path(mainDir, subDir))
files <- list.files(pattern = paste0(key, "\\.tsv$"))  # only the raw per-study files

for (ii in seq_along(files)) {
  fcs <- read_tsv(files[ii])

  # Same observation_ID construction as for the data matrix
  ids <- sapply(strsplit(fcs$`participant_id`, "\\."), `[`, 1)
  observation_ID <- gsub(" ", "", paste(ids, format(fcs$`study_time_collected`, nsmall = 1), sep = "_"))

  fcs.tmp <- cbind(observation_ID, fcs)
  fcs.tmp$observation_ID <- as.character(fcs.tmp$observation_ID)
  fcs.tmp$`Participant ID` <- ids

  # Keep only the first 8 columns for the annotation file
  fcs.2 <- fcs.tmp[, 1:8]

  # Are there duplicates?
  sum(duplicated(fcs.2))

  # Drop duplicated rows (keeping the first occurrence)
  fcs.3 <- fcs.2[!duplicated(fcs.2), ]

  fname <- paste(strsplit(files[ii], "_")[[1]][1], key, "annotation.tsv", sep = "_")
  write_tsv(fcs.3, fname)
}
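As a final optional check (a sketch; the filenames assume the naming used above, shown here for SDY80), the observation_IDs in an annotation file should appear among the columns of the corresponding data matrix:

ann <- read_tsv("SDY80_fcs_annotation.tsv")
mat <- read_tsv("SDY80_fcs_data_matrix.tsv")

# Every observation in the annotation should be a column of the data matrix
all(ann$observation_ID %in% colnames(mat))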