library(coloc)
library(dplyr)
library(data.table)

# Command-line arguments ----
# The script takes exactly four positional arguments; any other count aborts.
args <- commandArgs(trailingOnly = TRUE)
if (!identical(length(args), 4L)) {
  stop("Please provide GWAS file path, eQTL folder path, output CSV file path, and tissue name")
}

# Positional arguments, in order:
#   1. path to the GWAS summary statistics file
#   2. path to the folder containing the eQTL data
#   3. path of the output CSV holding the summary results
#   4. tissue name
gwas_file <- args[[1]]
eqtl_dir <- args[[2]]
output_file <- args[[3]]
tissue <- args[[4]]

# Load GWAS summary statistics ----
# read.table() defaults: whitespace-delimited text with a header row.
gwas_df <- read.table(gwas_file, header = TRUE)

# Fail fast on a malformed file. Without this check a missing `chr` column
# makes `gwas_df$chr` NULL, the chromosome loop never runs, and the script
# silently writes an empty result.
missing_gwas_cols <- setdiff(
  c("snp_id", "beta", "se", "p_value"),
  names(gwas_df)
)
# `$` is used for `chr` (rather than names()) so that the lookup below and
# this check agree on whether the column is present.
if (is.null(gwas_df$chr)) {
  missing_gwas_cols <- c("chr", missing_gwas_cols)
}
if (length(missing_gwas_cols) > 0) {
  stop(
    sprintf(
      "GWAS file %s is missing required column(s): %s",
      gwas_file,
      paste(missing_gwas_cols, collapse = ", ")
    ),
    call. = FALSE
  )
}

# Chromosomes present in the GWAS data; one eQTL file is looked up per value.
chromosomes <- unique(gwas_df$chr)

# Add the columns used for the coloc analysis (`beta` is used as-is).
# sdY = 1 because the traits were standardized (mean = 0, SD = 1).
# NOTE(review): N = 300 is hard-coded -- confirm it matches the GWAS cohort.
gwas_for_coloc <- gwas_df %>%
  mutate(
    snp = snp_id,
    var = se^2,
    sdY = 1,
    p = p_value,
    N = 300
  )

# Initialize result containers ----
# Both lists are keyed by gene (the eQTL `phenotype_id`):
#   results_list     - the coloc summary vector for each analysed gene
#   snp_results_list - SNP-level rows with SNP.PP.H4 > 0.4, tagged with gene
results_list <- list()
snp_results_list <- list()

# eQTL dataset constants, passed to coloc as scalars (coloc documents `N` and
# `sdY` as single values per dataset, not per-SNP vectors).
# NOTE(review): both values are hard-coded -- confirm against the eQTL cohort.
eqtl_sample_size <- 300
eqtl_sd_y <- 1

# Colocalization: one molQTL file per chromosome, one coloc run per gene ----
for (chr in chromosomes) {
  eqtl_file_path <- sprintf("%s/%s.cis_qtl_pairs.%s.csv", eqtl_dir, tissue, chr)
  if (!file.exists(eqtl_file_path)) {
    warning(sprintf("eQTL file for chromosome %s not found: %s", chr, eqtl_file_path))
    next
  }

  eqtl_df <- fread(eqtl_file_path)

  # One table per gene.
  list_of_eqtl_dfs <- split(eqtl_df, eqtl_df$phenotype_id)

  for (gene in names(list_of_eqtl_dfs)) {
    eqtl_sub_df <- list_of_eqtl_dfs[[gene]]

    # Only SNPs present in both datasets can be colocalized; skip genes
    # that share none with the GWAS.
    common_snps <- intersect(eqtl_sub_df$variant_id, gwas_for_coloc$snp)
    if (length(common_snps) == 0) {
      next
    }

    eqtl_for_coloc <- eqtl_sub_df %>%
      filter(variant_id %in% common_snps)
    gwas_for_coloc_filtered <- gwas_for_coloc %>%
      filter(snp %in% common_snps)

    # Run coloc.abf. The eQTL columns are read directly instead of being
    # copied into renamed columns for every gene.
    # NOTE(review): coloc names the p-value element `pvalues`; `p` looks
    # unused when beta/varbeta are supplied -- confirm before removing it.
    coloc_result <- coloc.abf(
      dataset1 = list(
        beta = eqtl_for_coloc$slope,
        snp = eqtl_for_coloc$variant_id,
        varbeta = eqtl_for_coloc$slope_se^2,
        p = eqtl_for_coloc$pval_nominal,
        N = eqtl_sample_size,
        sdY = eqtl_sd_y,
        type = "quant"
      ),
      dataset2 = list(
        beta = gwas_for_coloc_filtered$beta,
        snp = gwas_for_coloc_filtered$snp,
        varbeta = gwas_for_coloc_filtered$var,
        p = gwas_for_coloc_filtered$p,
        # N and sdY are constant columns; take the first value as the scalar.
        N = gwas_for_coloc_filtered$N[[1]],
        sdY = gwas_for_coloc_filtered$sdY[[1]],
        type = "quant"
      ),
      MAF = NULL
    )

    # Store summary results.
    results_list[[gene]] <- coloc_result$summary

    # Keep SNP-level rows with SNP.PP.H4 > 0.4 and add gene info. which()
    # drops NA comparisons, which would otherwise yield all-NA rows.
    snp_results <- coloc_result$results
    hit_rows <- which(snp_results$SNP.PP.H4 > 0.4)
    if (length(hit_rows) > 0) {
      snp_results_filtered <- snp_results[hit_rows, , drop = FALSE]
      snp_results_filtered$gene <- gene
      snp_results_list[[gene]] <- snp_results_filtered
    }
  }
}

# Combine summary results and write to CSV ----
# Each element of results_list is a coloc summary vector whose six entries
# are read by position (1 = nsnps, 2-6 = PP0..PP4 as labelled below). The
# table is built in one step instead of rbind()-ing one row per gene inside
# a loop, which copies the whole frame on every iteration.
summary_column <- function(idx) {
  vapply(
    results_list,
    function(coloc_res) coloc_res[[idx]],
    numeric(1),
    USE.NAMES = FALSE
  )
}

# With an empty results_list every column has length zero, so the CSV still
# gets its header row.
results_df <- data.frame(
  nsnps = summary_column(1),
  gene = as.character(names(results_list)),
  PP0 = summary_column(2),
  PP1 = summary_column(3),
  PP2 = summary_column(4),
  PP3 = summary_column(5),
  PP4 = summary_column(6),
  stringsAsFactors = FALSE
)

write.csv(results_df, output_file, row.names = FALSE)

# Combine SNP-level results and write them to a separate file ----
if (length(snp_results_list) > 0) {
  snp_results_df <- do.call(rbind, snp_results_list)

  # Derive "<name>_snp.csv" from the summary path. If that path does not end
  # in ".csv", sub() would return it unchanged and the SNP-level table would
  # overwrite the summary file, so the suffix is appended instead.
  csv_suffix <- "\\.csv$"
  if (grepl(csv_suffix, output_file)) {
    snp_output_file <- sub(csv_suffix, "_snp.csv", output_file)
  } else {
    snp_output_file <- paste0(output_file, "_snp.csv")
  }

  write.csv(snp_results_df, snp_output_file, row.names = FALSE)
} else {
  warning("No SNPs passed the SNP.PP.H4 > 0.4 threshold.")
}
