library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(purrr)
library(stringr)


## A3A ----
# Step 0: Load every per-sample base-count table from the analysis directory.

# The dots are escaped so that only names literally ending in ".base.txt"
# match (an unescaped "." is a regex wildcard).
tsv_files <- list.files("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/37_bam_depth/gene_sequence_analysis", pattern = "\\.base\\.txt$", full.names = TRUE)
if (length(tsv_files) == 0) {
  stop("No '.base.txt' files found in the gene_sequence_analysis directory.",
       call. = FALSE)
}

# Read all files and row-bind them into a single table.
df_raw <- map_df(tsv_files, read_tsv)


# Step 1: A3A reference sequence, taken from the `seq` column of A3A.ref.txt
# and split into one character per base.
ref_seq_string <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/37_bam_depth/gene_sequence_analysis/A3A.ref.txt")$seq
ref_seq <- strsplit(ref_seq_string, "")[[1]]

# Sorted positions observed for A3A; the i-th reference base is assigned to
# the i-th position.
a3a_positions <- sort(unique((df_raw %>% filter(APOBEC == "A3A"))$POS))

# `names<-` pads with NA when given fewer names than elements, which would
# silently misalign the reference lookup, so require a one-to-one match.
if (length(ref_seq) != length(a3a_positions)) {
  stop("A3A reference has ", length(ref_seq), " bases but ",
       length(a3a_positions), " distinct positions were found in the data.",
       call. = FALSE)
}
names(ref_seq) <- a3a_positions  # Assign positions as names

# Step 2: Reshape to one row per sample / position / base and compute the
# percentage of each base at that position.
# Rows are restricted to A3A first: `ref_seq` is named by A3A positions, and
# both tables written below are A3A-specific (the percent table previously
# contained every APOBEC value, unlike the count table).
df_long <- df_raw %>%
  filter(APOBEC == "A3A") %>%
  pivot_longer(cols = A:T, names_to = "Base", values_to = "RawCount") %>%
  rename(BAM = id, Position = POS) %>%
  group_by(BAM, Position) %>%
  mutate(
    Total = sum(RawCount),
    # Positions with no reads get 0 % instead of a division by zero.
    Percent = ifelse(Total > 0, 100 * RawCount / Total, 0),
    RefBase = ref_seq[as.character(Position)]
  ) %>%
  ungroup()

# Long-format raw counts (one row per sample / position / base).
df_long %>%
  write.table("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/37_bam_depth/gene_sequence_analysis/A3A_count.txt",
              sep = "\t",
              quote = FALSE,
              row.names = FALSE)

# Wide-format percentages (one column per base). `names_sort = TRUE` orders
# the base columns alphabetically, as `spread()` did; rows keep input order.
df_long %>%
  select(-RawCount) %>%
  pivot_wider(names_from = Base, values_from = Percent, names_sort = TRUE) %>%
  write.table("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/37_bam_depth/gene_sequence_analysis/A3A_percent.txt",
              sep = "\t",
              quote = FALSE,
              row.names = FALSE)
# Step 3: For every sample and position, keep the most frequent
# non-reference base among those reaching at least 10 %.
# Rows whose RefBase is NA are dropped by the comparison in filter().
major_alt_df <- df_long %>%
  filter(APOBEC == "A3A", Base != RefBase, Percent >= 10) %>%
  group_by(Position, BAM) %>%
  slice_max(order_by = Percent, n = 1, with_ties = FALSE) %>%
  ungroup()

# Step 4: Build the full position x sample grid so that cells without a
# qualifying alternative allele still get a (grey) tile.
# stringsAsFactors = FALSE keeps BAM as character, matching major_alt_df$BAM
# in the join below.
a3a_rows <- df_raw %>% filter(APOBEC == "A3A")
major_plot_df <- expand.grid(
  Position = unique(a3a_rows$POS),
  BAM = unique(a3a_rows$id),
  stringsAsFactors = FALSE
) %>%
  left_join(major_alt_df %>% select(Position, BAM, Base, Percent), by = c("Position", "BAM"))

# One hue per base.
base_colors <- c(A = "red", C = "blue", G = "orange", T = "green")

# Tile colour: the base's hue with opacity equal to its percentage (0-100 %).
# Cells with no alternative allele (or a base without a hue) are "#f0f0f0",
# the value later used to detect rows that contain no coloured tile.
major_plot_df <- major_plot_df %>%
  mutate(
    FillColor = case_when(
      Base %in% names(base_colors) ~ alpha(base_colors[Base], Percent / 100),
      TRUE ~ "#f0f0f0"
    )
  )

# Legend grid: every base at 0, 10, ..., 100 %.
# expand.grid() returns Base as a factor; index base_colors by the base
# *name* so the lookup does not depend on the factor's integer codes
# happening to line up with the order of base_colors.
legend_df <- expand.grid(
  Base = names(base_colors),
  Percent = seq(0, 100, by = 10)
) %>%
  mutate(FillColor = alpha(base_colors[as.character(Base)], Percent / 100))

# (2) Decide which y-axis labels to show: a position keeps its label only if
# at least one of its tiles is not the grey "no alternative allele" colour.
label_positions <- major_plot_df %>%
  group_by(Position) %>%
  summarise(HasColor = any(FillColor != "#f0f0f0")) %>%
  mutate(YLabel = ifelse(HasColor, as.character(Position), ""))

# Attach the label to every row of the plotting table.
major_plot_df <- major_plot_df %>%
  left_join(label_positions %>% select(Position, YLabel), by = "Position")
# Left-to-right sample order for the A3A heatmap.
A_col_order <- c("YHA-A3B-1st-Bulk-wgs-ILLUMINA",
                 "A3A_1st_C3",
                 "A3A_C3_TP53KO_C3",
                 "A3A_1st_C3_neg-1",
                 "A3A_1st_C3_neg-2",
                 "A3A_1st_C3_neg-3",
                 "A3A_1st_C3_100ng-1",
                 "A3A_1st_C3_100ng-2",
                 "A3A_1st_C3_100ng-3",
                 "A3A_C3_100-4",
                 "A3A_C3_100-5",
                 "A3A_1st_C3_3ug-1",
                 "A3A_1st_C3_3ug-2",
                 "A3A_1st_C3_3ug-3",
                 "A3A_C3_3-4",
                 "A3A_C3_3-5",
                 "A3A_C3_TP53_C3_Ctrl-1",
                 "A3A_C3_TP53_C3_Ctrl-2",
                 "A3A_C3_TP53_C3_Ctrl-3",
                 "A3A_C3_TP53_C3_100-1",
                 "A3A_C3_TP53_C3_100-2",
                 "A3A_C3_TP53_C3_100-3",
                 "A3A_C3_TP53_C3_3-1",
                 "A3A_C3_TP53_C3_3-2",
                 "A3A_C3_TP53_C3_3-3"
)

# factor() converts any value that is absent from `levels` to NA, so report
# samples that are in the data but not in the order list instead of letting
# them silently collapse into an NA column.
unlisted_bams <- setdiff(unique(as.character(major_plot_df$BAM)), A_col_order)
if (length(unlisted_bams) > 0) {
  warning("Samples missing from A_col_order (they become NA): ",
          paste(unlisted_bams, collapse = ", "), call. = FALSE)
}

major_plot_df$BAM <- factor(major_plot_df$BAM, levels = A_col_order)


# (5) Plot
####
# One outline rectangle per sample column. factor() codes are used as tile
# centres (the i-th level sits at integer i on a discrete axis), and each
# rectangle extends half a tile beyond the first and last position.
column_outline_df <- major_plot_df %>%
  mutate(
    x_center = as.numeric(factor(BAM)),
    y_center = as.numeric(factor(Position))
  ) %>%
  group_by(BAM) %>%
  summarise(
    xmin = min(x_center) - 0.5,
    xmax = max(x_center) + 0.5,
    ymin = min(y_center) - 0.5,
    ymax = max(y_center) + 0.5,
    .groups = "drop"
  )

p <- ggplot(major_plot_df, aes(x = BAM, y = factor(Position))) +
  # Heatmap tiles; FillColor already holds the final colour string.
  geom_tile(aes(fill = FillColor), color = NA) +

  # Draw a black rectangle around each sample column.
  geom_rect(
    data = column_outline_df,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    color = "black", fill = NA, linewidth = 0.1,
    inherit.aes = FALSE
  ) +

  # Use the colour strings as-is rather than mapping them through a palette.
  scale_fill_identity() +

  # Y-axis labels only for positions that have at least one coloured tile.
  scale_y_discrete(
    labels = function(pos) {
      label_positions$YLabel[match(pos, label_positions$Position)]
    }
  ) +

  labs(title = "Non-Reference Major Alleles (Color = Base, Intensity = %)",
       x = "Sample", y = NULL) +

  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5),  # (1) x-axis label size and rotation
    axis.ticks.length.x = unit(0.25, "cm"),
    axis.text.y = element_text(size = 8),
    # `linewidth` replaces the deprecated `size` argument for line elements.
    axis.ticks.x = element_line(linewidth = 1),
    axis.ticks.y = element_blank(),           # remove y-axis tick marks
    panel.grid = element_blank(),
    panel.border = element_blank()
  )
p
# Legend plot: one row per base, one tile per 10 % step.
# Axis-label colours are given as exactly one colour per base, looked up by
# base name in axis order, rather than one entry per legend_df row indexed
# through factor codes.
legend_base_levels <- levels(factor(legend_df$Base))
legend_plot <- ggplot(legend_df, aes(x = Percent, y = Base, fill = FillColor)) +
  geom_tile() +
  scale_fill_identity() +
  scale_x_continuous(expand = c(0, 0)) +
  theme_minimal(base_size = 12) +
  labs(x = "Percentage (%)", y = "Base") +
  theme(
    axis.text.y = element_text(color = unname(base_colors[legend_base_levels])),
    panel.grid = element_blank()
  )

legend_plot

# Combine heatmap and legend side by side (4:1 width ratio) and save as PDF.
library(patchwork)
p_A3A <- p + legend_plot + plot_layout(widths = c(4, 1))
p_A3A
ggsave("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/37_bam_depth/gene_sequence_analysis/A3A_seq.pdf", p_A3A,
       width = 8, height = 10)


## A3B ----


# Step 1: A3B reference sequence, taken from the `seq` column of A3B.ref.txt
# and split into one character per base.
# (A leftover `df %>% filter(APOBEC == "A3A")` line was removed here: this
# script never creates `df`, so the name resolves to the stats::df function
# and the call aborts the script.)
ref_seq_string <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/37_bam_depth/gene_sequence_analysis/A3B.ref.txt")$seq
ref_seq <- strsplit(ref_seq_string, "")[[1]]

# Sorted positions observed for A3B; the i-th reference base is assigned to
# the i-th position.
a3b_positions <- sort(unique((df_raw %>% filter(APOBEC == "A3B"))$POS))

# `names<-` pads with NA when given fewer names than elements, which would
# silently misalign the reference lookup, so require a one-to-one match.
if (length(ref_seq) != length(a3b_positions)) {
  stop("A3B reference has ", length(ref_seq), " bases but ",
       length(a3b_positions), " distinct positions were found in the data.",
       call. = FALSE)
}
names(ref_seq) <- a3b_positions  # Assign positions as names

# Step 2: Reshape to one row per sample / position / base and compute the
# percentage of each base at that position.
# Rows are restricted to A3B first: `ref_seq` is named by A3B positions, and
# both tables written below are A3B-specific.
df_long <- df_raw %>%
  filter(APOBEC == "A3B") %>%
  pivot_longer(cols = A:T, names_to = "Base", values_to = "RawCount") %>%
  rename(BAM = id, Position = POS) %>%
  group_by(BAM, Position) %>%
  mutate(
    Total = sum(RawCount),
    # Positions with no reads get 0 % instead of a division by zero.
    Percent = ifelse(Total > 0, 100 * RawCount / Total, 0),
    RefBase = ref_seq[as.character(Position)]
  ) %>%
  ungroup()

# Long-format raw counts (one row per sample / position / base).
df_long %>%
  write.table("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/37_bam_depth/gene_sequence_analysis/A3B_count.txt",
              sep = "\t",
              quote = FALSE,
              row.names = FALSE)

# Wide-format percentages (one column per base). `names_sort = TRUE` orders
# the base columns alphabetically, as `spread()` did; rows keep input order.
df_long %>%
  select(-RawCount) %>%
  pivot_wider(names_from = Base, values_from = Percent, names_sort = TRUE) %>%
  write.table("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/37_bam_depth/gene_sequence_analysis/A3B_percent.txt",
              sep = "\t",
              quote = FALSE,
              row.names = FALSE)
# Step 3: For every sample and position, keep the most frequent
# non-reference base among those reaching at least 10 %.
# Rows whose RefBase is NA are dropped by the comparison in filter().
major_alt_df <- df_long %>%
  filter(APOBEC == "A3B", Base != RefBase, Percent >= 10) %>%
  group_by(Position, BAM) %>%
  slice_max(order_by = Percent, n = 1, with_ties = FALSE) %>%
  ungroup()

# Step 4: Build the full position x sample grid so that cells without a
# qualifying alternative allele still get a (grey) tile.
# stringsAsFactors = FALSE keeps BAM as character, matching major_alt_df$BAM
# in the join below.
a3b_rows <- df_raw %>% filter(APOBEC == "A3B")
major_plot_df <- expand.grid(
  Position = unique(a3b_rows$POS),
  BAM = unique(a3b_rows$id),
  stringsAsFactors = FALSE
) %>%
  left_join(major_alt_df %>% select(Position, BAM, Base, Percent), by = c("Position", "BAM"))

# One hue per base.
base_colors <- c(A = "red", C = "blue", G = "orange", T = "green")

# Tile colour: the base's hue with opacity equal to its percentage (0-100 %).
# Cells with no alternative allele (or a base without a hue) are "#f0f0f0",
# the value later used to detect rows that contain no coloured tile.
major_plot_df <- major_plot_df %>%
  mutate(
    FillColor = case_when(
      Base %in% names(base_colors) ~ alpha(base_colors[Base], Percent / 100),
      TRUE ~ "#f0f0f0"
    )
  )

# Legend grid: every base at 0, 10, ..., 100 %.
# expand.grid() returns Base as a factor; index base_colors by the base
# *name* so the lookup does not depend on the factor's integer codes
# happening to line up with the order of base_colors.
legend_df <- expand.grid(
  Base = names(base_colors),
  Percent = seq(0, 100, by = 10)
) %>%
  mutate(FillColor = alpha(base_colors[as.character(Base)], Percent / 100))

# (2) Decide which y-axis labels to show: a position keeps its label only if
# at least one of its tiles is not the grey "no alternative allele" colour.
label_positions <- major_plot_df %>%
  group_by(Position) %>%
  summarise(HasColor = any(FillColor != "#f0f0f0")) %>%
  mutate(YLabel = ifelse(HasColor, as.character(Position), ""))

# Attach the label to every row of the plotting table.
major_plot_df <- major_plot_df %>%
  left_join(label_positions %>% select(Position, YLabel), by = "Position")
# Left-to-right sample order for the A3B heatmap.
# (A second, shorter definition of A_col_order that followed this vector was
# removed: nothing in the A3B section reads it, and it overwrote the order
# used for the A3A plot.)
B_col_order <- c(
  "YHA-A3B-1st-Bulk-wgs-ILLUMINA",
  "YHA-A3B-1st-C5-wgs-ILLUMINA",
  "C5_TP53_KO",
  "A3B_1st_C5_100ng_48h_SC-1",
  "A3B_1st_C5_100ng_48h_SC-3",
  "A3B_1st_C5_TP53KO_48h_ctrl_C1",
  "A3B_1st_C5_TP53KO_48h_ctrl_C2",
  "A3B_1st_C5_TP53KO_48h_ctrl_C3",
  "A3B_1st_C5_TP53KO_48h_100ng_C1",
  "A3B_1st_C5_TP53KO_48h_100ng_C2",
  "A3B_1st_C5_TP53KO_48h_100ng_C12",
  "A3B_1st_C5_TP53KO_48h_3ug_C5",
  "A3B_1st_C5_TP53KO_48h_3ug_C9",
  "A3B_1st_C5_TP53KO_48h_3ug_C14"
)

# factor() converts any value that is absent from `levels` to NA, so report
# samples that are in the data but not in the order list instead of letting
# them silently collapse into an NA column.
unlisted_bams <- setdiff(unique(as.character(major_plot_df$BAM)), B_col_order)
if (length(unlisted_bams) > 0) {
  warning("Samples missing from B_col_order (they become NA): ",
          paste(unlisted_bams, collapse = ", "), call. = FALSE)
}

major_plot_df$BAM <- factor(major_plot_df$BAM, levels = B_col_order)


# (5) Plot
####
# One outline rectangle per sample column. factor() codes are used as tile
# centres (the i-th level sits at integer i on a discrete axis), and each
# rectangle extends half a tile beyond the first and last position.
column_outline_df <- major_plot_df %>%
  mutate(
    x_center = as.numeric(factor(BAM)),
    y_center = as.numeric(factor(Position))
  ) %>%
  group_by(BAM) %>%
  summarise(
    xmin = min(x_center) - 0.5,
    xmax = max(x_center) + 0.5,
    ymin = min(y_center) - 0.5,
    ymax = max(y_center) + 0.5,
    .groups = "drop"
  )

p <- ggplot(major_plot_df, aes(x = BAM, y = factor(Position))) +
  # Heatmap tiles; FillColor already holds the final colour string.
  geom_tile(aes(fill = FillColor), color = NA) +

  # Draw a black rectangle around each sample column.
  geom_rect(
    data = column_outline_df,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    color = "black", fill = NA, linewidth = 0.1,
    inherit.aes = FALSE
  ) +

  # Use the colour strings as-is rather than mapping them through a palette.
  scale_fill_identity() +

  # Y-axis labels only for positions that have at least one coloured tile.
  scale_y_discrete(
    labels = function(pos) {
      label_positions$YLabel[match(pos, label_positions$Position)]
    }
  ) +

  labs(title = "Non-Reference Major Alleles (Color = Base, Intensity = %)",
       x = "Sample", y = NULL) +

  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 10, angle = 90, vjust = 0.5),  # (1) x-axis label size and rotation
    axis.ticks.length.x = unit(0.25, "cm"),
    axis.text.y = element_text(size = 8),
    # `linewidth` replaces the deprecated `size` argument for line elements.
    axis.ticks.x = element_line(linewidth = 1),
    axis.ticks.y = element_blank(),           # remove y-axis tick marks
    panel.grid = element_blank(),
    panel.border = element_blank()
  )
p
# Legend plot: one row per base, one tile per 10 % step.
# Axis-label colours are given as exactly one colour per base, looked up by
# base name in axis order, rather than one entry per legend_df row indexed
# through factor codes.
legend_base_levels <- levels(factor(legend_df$Base))
legend_plot <- ggplot(legend_df, aes(x = Percent, y = Base, fill = FillColor)) +
  geom_tile() +
  scale_fill_identity() +
  scale_x_continuous(expand = c(0, 0)) +
  theme_minimal(base_size = 12) +
  labs(x = "Percentage (%)", y = "Base") +
  theme(
    axis.text.y = element_text(color = unname(base_colors[legend_base_levels])),
    panel.grid = element_blank()
  )

legend_plot

# Combine heatmap and legend side by side (4:1 width ratio) and save as PDF.
# The A3A figure is no longer re-printed here, so this section does not
# require p_A3A to exist.
library(patchwork)
p_A3B <- p + legend_plot + plot_layout(widths = c(4, 1))
p_A3B
ggsave("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WGS/37_bam_depth/gene_sequence_analysis/A3B_seq.pdf", p_A3B,
       width = 10, height = 14)
