Genómica con R

Autor/a

Francisco Ascue Orosco

Introducción

En este tutorial, aprenderemos a configurar un entorno de trabajo en R para realizar análisis de bioinformática genómica. Cubriremos la instalación de las bibliotecas necesarias y cómo cargarlas en R. A lo largo del tutorial, utilizaremos el código proporcionado en el archivo main.R.

Instalación y carga de bibliotecas

Comenzamos instalando y cargando las bibliotecas requeridas. Asegúrate de que R esté configurado para instalar paquetes si aún no tienes estas bibliotecas instaladas. Ejecuta el siguiente código en tu consola de R:

source("install.R", local = TRUE)

install.R

El archivo install.R contiene un script para instalar las bibliotecas necesarias. A continuación, se muestra el contenido de install.R:

# Package R dependencies
if (!is.element("devtools", rownames(installed.packages()))) {
  install.packages("devtools")
  install.packages("BiocManager")
}

library(devtools)

# Install missing packages
missingPackages <- function(pkg) {
  if (!is.element(pkg, rownames(installed.packages()))) {
    message(pkg, "-----> Package is not installed ")
    BiocManager::install(pkg)
  }
}

dependencies <- c(
  "dada2", "Rbowtie2", "Rsamtools", "ape", "rjson",
  "dplyr", "foreach", "doParallel", "tidyr", "Biostrings",
  "gmapR", "fastqcr", "ShortRead"
)

for (i in dependencies) {
  missingPackages(i)
  library(i, character.only = TRUE)
}

if (!is.element("GEOquery", rownames(installed.packages()))) {
  devtools::install_github('GEOquery', 'seandavi')
}

if (!is.element("radiator", rownames(installed.packages()))) {
  devtools::install_github("thierrygosselin/radiator")
}

library(radiator)

Funciones y archivos Fuente Utilizados

En esta sección, se colocaran las funciones y los archivos fuentes que utilizaremos durante el curso.

Descarga de SRA Toolkit

El siguiente código descarga SRA Toolkit dependiendo del sistema operativo (Windows o Linux) y configura las funciones necesarias para su uso.

scripts/sratoolkit.R

# Código para descargar SRA Toolkit y configurar funciones específicas
# según el sistema operativo (Windows o Linux)
# ...

# Funciones para descargar y dividir archivos SRA
setClass("sratoolkit", slots = list(dest = "character", sraid = "character"))

setGeneric("downloadsra", function(obj) {
  standardGeneric("downloadsra")
})

setGeneric("fastqdump", function(obj, output) {
  standardGeneric("fastqdump")
})

Listado de archivos FASTQ

El archivo listfastq.R contiene una función llamada list_fastq que se utiliza para listar los archivos FASTQ en un directorio específico y organizarlos según un patrón.

scripts/listfastq.R

list_fastq <- function(ruta = getwd(), pattern = c("_1.fastq", "_2.fastq")){
  #Set working directory
  setwd(ruta)
  fastq= paste0(ruta,"/data/rawdata")
  list.files(fastq)
  lecturasf <- sort(list.files(fastq, pattern = pattern[1], full.names = T))
  lecturasr <- sort(list.files(fastq, pattern = pattern[2], full.names = T))
  nombres_muestras <- sapply(strsplit(basename(lecturasf),"_"), `[`,1)
  return(list(lf = lecturasf, lr = lecturasr, name = nombres_muestras))
  
}

Filtrado y recorte de lecturas

El archivo filterreads.R contiene una función llamada filter_reads que se utiliza para filtrar y recortar lecturas de archivos FASTQ.

scripts/filtereads.R


#----filtrado y recorte de lecturas----

filter_reads <- function(name, lf, lr, trunc = 150){
  
  filtF <- file.path(paste0(getwd(),"/data/processed_data"), "filtered_F", 
                     paste0(name, "_filt_1.fastq"))
  filtR <- file.path(paste0(getwd(),"/data/processed_data"), "filtered_R", 
                     paste0(name, "_filt_2.fastq"))
  names(filtF) <- name
  names(filtR) <- name

  out <- filterAndTrim(lf,filtF,lr,filtR, 
                       truncLen = c(trunc,trunc), 
                       maxN = 0,
                       maxEE = c(2,2),
                       truncQ = 2,
                       rm.phix = T,
                       compress = F,
                       multithread = T)
  
  recovery <- round(out[2]/out[1]*100,2)
  log <- paste0("Secuencias recuperadas : ",recovery, "%")
  cat(log)
  return(log)
}

Generación de consenso

El archivo consensus.R contiene una función llamada consensus_parallel que se utiliza para generar secuencias de consenso a partir de datos de secuenciación. La función consensus_parallel toma varios argumentos, incluyendo el rango de posiciones, profundidad, umbral de frecuencia y otros.

scripts/filtereads.R


consensus_parallel <- function(sup, inf=1, ex, depth=0, freq_threshold = 0.6, 
                               mltcore, fastafile = NULL){
  
  nucl_string <-function(num, ex,depth,freq_threshold ){
    ## exist position 
    if(nrow(ex[which(ex$pos == num), ]) == 0){
      return <- "N"
    }else{
      tmp <- ex[which(ex$pos ==num), ]
      ## depth >= threshold
      if(sum(tmp$count) >= depth){
        tmp <- tmp %>% dplyr::mutate(Percentage = count/sum(count))
        ### most frequency nucl >= threshold 
        if(max(tmp$Percentage) >= freq_threshold){
          tmp <- as.character(tmp[which(tmp$Percentage == max(tmp$Percentage)),3])
          return <- tmp
          ### most frequency nucl < threshold 
        }else{
          
          ## capture two more frequency nucl 
          abg <- as.character(tmp[which(tmp$Percentage >= freq_threshold/2),3])
          abg_rev <- rev(abg)
          ## make a string normal and rev
          abg <-  paste(abg, collapse = "")
          abg_rev <- paste(abg_rev, collapse = "")
          
          ## identify IUPAC code 
          result = switch(  
            abg,  
            "AG"= "R",  
            "CT"= "Y",  
            "GC"= "S",  
            "AT"= "W",  
            "GT"= "K",  
            "AC"= "M"
          )  
          
          result_rev = switch(  
            abg_rev,  
            "AG"= "R",  
            "CT"= "Y",  
            "GC"= "S",  
            "AT"= "W",  
            "GT"= "K",  
            "AC"= "M"
          )  
          
          ## assign code to vector consensus
          if(is.null(result_rev) && is.null(result)){
            return <-  "N"
          }else{
            r <- c(result,result_rev)
            return <-  r
          }
          
        }
        
        ## depth <= threshold   
      }else{
        return <-  "N"
      }
    }
  }
  
  # dna_vector <- c()
  
  my.cluster <- parallel::makeCluster(
    mltcore, 
    type = "PSOCK"
  )
  
  doParallel::registerDoParallel(cl = my.cluster)
  foreach::getDoParRegistered()
  foreach::getDoParWorkers()
  
  x <- foreach(i = 1:sup, .combine = 'c',.packages='dplyr')%dopar%{
    nucl_string(num = i, ex = ex,depth = depth,
                freq_threshold = freq_threshold )
  }
  
  #make string of vector consensus
  dna_string <- paste(x, collapse = "")
  parallel::stopCluster(cl = my.cluster)
  
  if(!is.null(fastafile)){
    fileConn<-file(fastafile)
    writeLines(c(paste0(">",as.character(res$seqnames[1]),
                        "_",format(inf,scientific=FALSE),
                        ":",format(sup,scientific=FALSE)),
                 dna_string), fileConn)
    close(fileConn)
  }
  return(x)
}

Volver arriba