library(xgboost)
library(dplyr)
library(ggplot2)
library(caret)

# Set working directory and read data
# NOTE(review): a hard-coded setwd() ties the script to one machine; it is kept
# because the relative input/output paths in this script resolve against it.
setwd("F:/")
data <- read.table("input_xgboost.txt", header = TRUE, row.names = 1)

# Define the target column
target_column <- "Egg_Production"
if (!target_column %in% colnames(data)) {
  stop("Target column '", target_column, "' not found in the input data",
       call. = FALSE)
}
if (nrow(data) < 2) {
  stop("Need at least 2 rows to build a train/test split", call. = FALSE)
}

# Define XGBoost model parameters
params <- list(
  objective = "reg:squarederror",  # Regression task with squared error loss
  eval_metric = "rmse"             # RMSE as evaluation metric
)

# Split data into training (80%) and test (20%) sets
set.seed(123)
train_indices <- sample(seq_len(nrow(data)), size = floor(0.8 * nrow(data)))
train_data <- data[train_indices, , drop = FALSE]
test_data <- data[-train_indices, , drop = FALSE]

# Data preprocessing
# The Box-Cox / centering / scaling parameters are estimated from the training
# rows only and then applied to both subsets, so no information from the test
# rows leaks into the transformation. All columns (including the target) are
# transformed, so the metrics computed later are in transformed units.
# NOTE(review): a column that is strictly positive in training but not in the
# test rows may yield NaN after Box-Cox on the test set - confirm for this data.
preProcValues <- preProcess(train_data,
                            method = c("BoxCox", "center", "scale"))
train_data <- predict(preProcValues, train_data)
test_data <- predict(preProcValues, test_data)
head(train_data)

# Create DMatrix for training and testing
# drop = FALSE keeps a one-column selection as a data frame so the feature
# matrix retains its column name.
is_feature <- !colnames(train_data) %in% target_column
dtrain <- xgb.DMatrix(
  data = as.matrix(train_data[, is_feature, drop = FALSE]),
  label = train_data[[target_column]]
)
dtest <- xgb.DMatrix(
  data = as.matrix(test_data[, is_feature, drop = FALSE]),
  label = test_data[[target_column]]
)

# Train the model
# xgb.train() is the entry point that takes a params list together with an
# xgb.DMatrix. The xgboost() wrapper used previously was redesigned in newer
# package releases around an x/y interface, so this call is the portable form.
model <- xgb.train(params = params, data = dtrain, nrounds = 1000, verbose = 0)

# Make predictions on the test set
predictions <- predict(model, dtest)

# Compute performance metrics on the test set
observed <- test_data[[target_column]]
prediction_error <- predictions - observed

rmse_val <- sqrt(mean(prediction_error^2))   # root mean squared error
mae_val <- mean(abs(prediction_error))       # mean absolute error
pearson_corr_val <- cor(predictions, observed, method = "pearson")

# Print the metrics, one per line, in a fixed order
metric_report <- c(
  "Pearson Correlation: " = pearson_corr_val,
  "RMSE: " = rmse_val,
  "MAE: " = mae_val
)
for (metric_label in names(metric_report)) {
  cat(metric_label, metric_report[[metric_label]], "\n")
}

# Extract the per-feature importance table from the trained model, show it,
# and save it as a tab-separated file
importance_matrix <- xgb.importance(model = model)
print(importance_matrix)
write.table(
  importance_matrix,
  file = "feature_importance.txt",
  sep = "\t",
  row.names = FALSE
)

# Appearance shared by the bar chart: minimal theme, no grid, black axis lines
importance_theme <- list(
  theme_minimal(),
  theme(
    axis.line = element_line(color = "black"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# Horizontal bar chart of Gain, with features ordered by their Gain
gene_importance_plot <-
  ggplot(importance_matrix, aes(x = reorder(Feature, Gain), y = Gain)) +
  geom_col(fill = "steelblue") +
  labs(title = "Feature Importance (Gain)", x = "Gene", y = "Gain") +
  importance_theme +
  coord_flip()
print(gene_importance_plot)

# Pair each test-set prediction with the corresponding observed value
correlation_data <- data.frame(
  Predicted = predictions,
  Actual = test_data[[target_column]]
)

# Appearance of the scatter plot: minimal theme, no grid, black axis lines
scatter_theme <- list(
  theme_minimal(),
  theme(
    axis.line = element_line(color = "black"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# Scatter plot of predicted against actual values with a linear fit overlaid
p <- ggplot(correlation_data, aes(x = Actual, y = Predicted)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", color = "lightcoral") +
  labs(
    title = "Correlation between Actual and Predicted Values",
    x = "Actual Values",
    y = "Predicted Values"
  ) +
  scatter_theme
print(p)

# Pearson correlation test between actual and predicted values
correlation_test <- cor.test(
  correlation_data$Actual,
  correlation_data$Predicted,
  method = "pearson"
)
print(correlation_test)
