# Load necessary libraries
library(data.table)
library(Matrix)
library(qvalue)
library(PCAForQTL)

# Load expression data (rows = molecular phenotypes, columns = individuals).
# check.names = FALSE keeps the sample IDs in the header exactly as written,
# so they can be matched against the row names of the other tables.
expression_data <- read.table(
  "F:/TWAS-TWAS/TPM.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
# log2(x + 1) transform, then transpose: rows = individuals, columns = genes.
expression_data <- as.data.frame(t(log2(expression_data + 1)))

# Load phenotype data and covariate information (rows = individuals).
phenotype_data <- read.table(
  "F:/TWAS-TWAS/Phenotype_Liver.txt",
  header = TRUE, row.names = 1
)
sex_data <- read.table(
  "F:/TWAS-TWAS/Sex_Info.txt",
  header = TRUE, row.names = 1
)

# Check data consistency: same number of samples ...
if (nrow(expression_data) != nrow(phenotype_data)) {
  stop("Mismatch in the number of samples between expression and phenotype data")
}

# ... and the same sample IDs. Equal row counts alone do not guarantee that
# sorting each table independently lines up the same individuals.
sample_ids <- sort(rownames(expression_data))
missing_pheno <- setdiff(sample_ids, rownames(phenotype_data))
missing_sex <- setdiff(sample_ids, rownames(sex_data))
if (length(missing_pheno) > 0 || length(missing_sex) > 0) {
  stop(
    "Sample IDs differ between datasets; missing from phenotype data: ",
    length(missing_pheno), ", missing from sex data: ", length(missing_sex),
    call. = FALSE
  )
}

# Put every table in the same (sorted) sample order. drop = FALSE keeps
# single-column tables as data frames so row and column names survive.
expression_data <- expression_data[sample_ids, , drop = FALSE]
phenotype_data <- phenotype_data[sample_ids, , drop = FALSE]
sex_data <- sex_data[sample_ids, , drop = FALSE]

# Perform Principal Component Analysis (PCA) on the expression matrix.
# NOTE(review): the PCA is run without centering or scaling; the PCAForQTL
# documentation appears to use center = TRUE, scale. = TRUE -- confirm that
# the uncentered PCA is intentional before changing it.
prcompResult <- prcomp(expression_data, center = FALSE, scale. = FALSE)
PCs <- prcompResult$x

# Run the elbow method to determine the number of PCs to keep.
n_pcs <- PCAForQTL::runElbow(prcompResult = prcompResult)
# seq_len() + drop = FALSE keep a matrix even when only one PC is selected.
significant_pcs <- PCs[, seq_len(n_pcs), drop = FALSE]

# Prepare data for regression
X <- as.matrix(expression_data)  # Expression matrix (individuals x genes)

# Combine the selected PCs with the covariates into one numeric design matrix.
# model.matrix() expands character/factor covariates into dummy columns, so G
# can be used directly as a term in lm(). na.pass keeps rows with missing
# covariates (lm() drops them per model) so G stays row-aligned with X.
covariate_frame <- data.frame(significant_pcs, sex_data)
covariate_mf <- model.frame(~ ., data = covariate_frame, na.action = na.pass)
G <- model.matrix(attr(covariate_mf, "terms"), data = covariate_mf)
G <- G[, colnames(G) != "(Intercept)", drop = FALSE]  # lm() adds its own
stopifnot(nrow(G) == nrow(X))

# Initialize a list to store results for all phenotypes
all_cor_results <- list()

# Gene identifiers, used to label result rows and failure messages
gene_ids <- colnames(X)

# Loop over each phenotype column to perform regression analysis.
# For every gene the model is: phenotype ~ gene expression + covariates (G).
for (phenotype_col in colnames(phenotype_data)) {
  Y <- phenotype_data[[phenotype_col]]  # Current phenotype data

  # Estimate and p-value of the gene-expression term for each gene.
  # vapply() guarantees a 2 x n_genes numeric matrix, even for zero genes.
  cor_results <- vapply(seq_len(ncol(X)), function(j) {
    gene_expr <- X[, j]
    lm_model <- try(lm(Y ~ gene_expr + G), silent = TRUE)
    if (inherits(lm_model, "try-error")) {
      # Report the gene name, not the whole expression vector
      cat("Model did not converge for gene:", gene_ids[j], "\n")
      return(c(NA_real_, NA_real_))  # Return NA if the model fails
    }
    coef_table <- summary(lm_model)$coefficients
    # summary.lm() omits aliased coefficients (e.g. a constant gene), so the
    # gene term must be looked up by name; position 2 could be a covariate.
    if (!"gene_expr" %in% rownames(coef_table)) {
      return(c(NA_real_, NA_real_))
    }
    c(
      coef_table["gene_expr", "Estimate"],
      coef_table["gene_expr", "Pr(>|t|)"]
    )
  }, numeric(2))

  # Convert results to a genes x 2 table and add row/column names
  cor_results <- t(cor_results)
  rownames(cor_results) <- gene_ids
  colnames(cor_results) <- c(
    paste0("Estimate_", phenotype_col),
    paste0("P.value_", phenotype_col)
  )

  # Perform FDR correction across genes within this phenotype
  cor_results <- as.data.frame(cor_results)
  cor_results[[paste0("FDR_", phenotype_col)]] <- p.adjust(
    cor_results[[paste0("P.value_", phenotype_col)]],
    method = "fdr"
  )

  # Store the results for the current phenotype
  all_cor_results[[phenotype_col]] <- cor_results
}

# Combine results from all phenotypes into a single data frame.
# Fail loudly if no phenotype was analysed rather than writing an empty file.
if (length(all_cor_results) == 0) {
  stop("No phenotype results to combine; nothing was analysed", call. = FALSE)
}
final_results <- do.call(cbind, all_cor_results)

# Write the results to an output file (tab-separated, genes as row names)
write.table(
  final_results,
  file = "F:/TWAS-TWAS/TEST.txt",
  quote = FALSE, sep = "\t", row.names = TRUE
)

# Print the first few rows of the final results.
# Explicit print() so the preview also appears when the script is source()d,
# where top-level values are not auto-printed.
print(head(final_results))
