library("SNPlocs.Hsapiens.dbSNP.20101109")
## library(EY4)
library(GenomicFeatures)

## So here are our snpIds
## Read the SNP metadata table bundled with the AdvancedR2011 package, then
## flatten it to a character vector and strip the leading "rs" from each ID.
snp_id <- read.csv(
  system.file("extdata", "snpMetadata.csv", package = "AdvancedR2011")
)
snpIds <- sub("^rs", "", as.character(t(snp_id)))



## Also want to read in all SNPs as ranges.
## NOTE(review): presumably unlist(COMPATIBLE_BSGENOMES) supplies the sequence
## names to fetch -- confirm against the SNPlocs package documentation.
snpRanges <- getSNPlocs(unlist(COMPATIBLE_BSGENOMES), as.GRanges=TRUE)
## Cache the freshly fetched ranges in the current working directory.
save(snpRanges, file="snpRanges.Rda")
## Load the copy stored in the AdvancedR2011 package's extdata directory.
## NOTE(review): if that file defines `snpRanges`, it replaces the object
## computed above -- confirm that this is intended.
load(system.file("extdata","snpRanges.Rda",package="AdvancedR2011"))

## srs <- snpRanges
## this should work here but doesn't:
## foo <- do.call(c, srs)
## This will work but is painfully slow.  I need to subset all the pieces 1st!
## sr <- unlist(GRangesList(snpRanges))

## subset function
## Keep only the elements of `gr` whose RefSNP ID is one of the wanted IDs.
##
## NOTE: this definition masks base::subset() for the rest of the script; the
## name is kept because the script passes `subset` to lapply() further down.
##
## Arguments:
##   gr   object whose values(gr) metadata has a "RefSNP_id" column and which
##        supports logical indexing with `[`.
##   ids  IDs to keep (without the "rs" prefix). Defaults to the script-level
##        `snpIds`, so existing one-argument calls behave exactly as before.
## Returns:
##   `gr` restricted to the elements whose "RefSNP_id" is found in `ids`.
subset <- function(gr, ids = snpIds) {
  ## as.character() first so a factor column is compared by label, not by
  ## level code; as.integer() keeps the numeric normalisation of the IDs.
  ref_ids <- as.integer(as.character(values(gr)[["RefSNP_id"]]))
  gr[ref_ids %in% ids]
}
## Apply the ID filter to every element of snpRanges, then collapse the
## filtered pieces into one object.
srs <- lapply(X = snpRanges, FUN = subset)
sr <- unlist(GRangesList(srs))
# save(sr,file="sr.Rda")


## can just pick up from here: load the copy of sr stored in the package.
load(file = system.file("extdata", "sr.Rda", package = "AdvancedR2011"))


## Make a txdb for hg19. The two commented calls below are presumably how the
## packaged hg19.sqlite file was produced; the script itself only loads it.
## txdb <- makeTranscriptDbFromUCSC("hg19")
## saveFeatures(txdb, file="hg19.sqlite")
txdb <- loadFeatures(
  system.file("extdata", "hg19.sqlite", package = "AdvancedR2011")
)

## Group the transcripts by gene, then take the range of each group.
txs <- transcriptsBy(txdb, by = "gene")
txsl <- range(txs)
## something bad happens when I unlist my GRL - there are really overly
## complex transcript to gene associations present.
## Some REALLY odd annots like:
## transcriptsBy(txdb, use.names=TRUE)[which(elementLengths(redtxsl) == 3)]

## I could filter, or I could map and proceed 
## redtxs <- unlist(redtxsl)  
## names(redtxs) <- names(txs)
## make sure that gene_id is in the metadata!
## values(redtxs) <- DataFrame(names(redtxs))

## Then match the two sets of ranges together (do this for folks)
## matchAndMerge(snpRanges, redtxs)


## lets map and proceed.
## 1st we have to clean up the seqnames a bit (just clean up the one side),
## e.g. "ch1" -> "chr1" and "chMT" -> "chrM". Both patterns are anchored so
## that a name which already has the target form is left unchanged.
seqnames(sr) <- sub("^chr?", "chr", seqnames(sr))
seqnames(sr) <- sub("MT$", "M", seqnames(sr))
## overlap
ol <- findOverlaps(query = sr, subject = txsl)
## append gene_ids to sr
## would like to do:
## values(sr)[queryHits(ol)] <- names(txsl)[subjectHits(ol)]
## But I can't so I have to do something like this.
## Start from a character NA so gene_id is a character column even when
## nothing overlaps. When one SNP hits several entries of txsl, the indexed
## assignment below keeps only the last hit.
Ids <- rep(NA_character_, length(sr))
Ids[queryHits(ol)] <- names(txsl)[subjectHits(ol)]

## Attach the IDs as a gene_id metadata column and flatten to a data.frame.
vals <- DataFrame(values(sr), gene_id = Ids)
values(sr) <- vals
df <- as.data.frame(sr)
rownames(df) <- NULL

## Put the rows back into the order of the original SNP list and attach the
## original IDs as a "snp_id" column (this is also what supplies an index when
## the table is written with default row names).
## match() returns NA for every SNP that is absent from df (605 of them per
## the original notes), so those SNPs come back as rows that are NA in every
## column except snp_id. Messy, but it keeps the two tables aligned.
names(snp_id) <- "snp_id"
df <- cbind(df[match(snpIds, df[["RefSNP_id"]]), ], snp_id)
rownames(df) <- NULL

## Per the original notes, dbSNP only had 113735 of the 114340 SNPs in the
## list, so 605 rows carry no position (and, unsurprisingly, not every SNP
## could be mapped to a gene either).
## Keep only the rows that have a start coordinate.
subsnp <- df[which(!is.na(df[["start"]])), ]

## write.csv(df, file="snps.csv", row.names = FALSE)
## For now, reset the row names and write the table out; the row names are
## emitted as an unnamed leading index column.
rownames(subsnp) <- NULL
write.csv(x = subsnp, file = "snps.csv", row.names = TRUE)








## lets make some quick mods to the subject Data
sub <- read.csv(system.file("extdata", "subjectMetadata.csv",
                            package = "AdvancedR2011"))
colnames(sub) <- c("subject_id", "disease")

## Translate the disease codes into labels: "1" -> "case", "0" -> "control".
## The patterns are anchored so only a value that is exactly "1" or "0" is
## rewritten; any other value passes through unchanged instead of having its
## individual digits replaced.
cc <- as.character(sub[["disease"]])
cc <- gsub("^1$", "case", cc)
cc <- gsub("^0$", "control", cc)
## The column name is supplied by the cbind() argument, so cc itself carries
## no names (naming it would only label its first element).
sub <- cbind(sub, case_control = cc)

## Written with default row names, i.e. with an unnamed leading index column.
write.csv(sub, file = "subjects.csv")
## rename "" to be "id"
## NOTE(review): presumably a manual edit of the header in subjects.csv before
## the file is placed in the package's extdata directory -- confirm.


## Read the packaged subjects table back in.
subjects <- read.csv(system.file("extdata", "subjects.csv",
                                 package = "AdvancedR2011"))



## Then we will need to make tables and populate them etc.
