From 1820ce8dce5072c10cdc9d9a7230ca4539f6c408 Mon Sep 17 00:00:00 2001 From: Panciera Date: Tue, 20 Dec 2016 17:44:41 -0500 Subject: [PATCH 1/4] initial commint of vcfmerge --- src/VcfMerge.hs | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/VcfMerge.hs diff --git a/src/VcfMerge.hs b/src/VcfMerge.hs new file mode 100644 index 0000000..a17a2ca --- /dev/null +++ b/src/VcfMerge.hs @@ -0,0 +1,50 @@ +{-# LANGUAGE OverloadedStrings #-} + +import Safe (atMay, headMay, lastMay, readMay) +import Data.Maybe (fromMaybe) +import qualified Data.Text as T +import Data.Text (Text()) +import Data.List (find, intercalate) +import Control.Monad (when) + +pos :: Text -> Int +pos s = fromMaybe (-1) (toInt =<< (nth 1 $ T.splitOn "\t" s)) + +toInt x = readMay $ T.unpack x :: Maybe Int +nth = flip atMay +readNuc x = readMay $ T.unpack x :: Maybe Nucleotide +data Nucleotide = A | T | C | G + deriving (Show, Eq, Read) + +data Stats = Stats { dp :: Int, alts :: [Nucleotide], pacs :: [Int] } + deriving (Show, Eq) +-- +formatStats Stats {dp=dp', alts=alts', pacs=pacs'} = s where + s = (show dp') ++ "\t" ++ (intercalate "," $ zipWith (\x y -> (show x) ++ "==" ++ (show y)) alts' pacs') +-- +-- +s = "Den4/KDH0146A/Thailand/Unk/Den4_1\t6\t.\tA\tG,C,T\t.\t.\tDP=2998;RC=2973;RAQ=37;PRC=99;AC=1,10,14;AAQ=37,37,37;PAC=3,2,1;CBD=2973;CB=A;HPOLY" +---- get DP; then depending on number of alts, expect and get that number of PAC (note PAC may be zero) +-- | hey +-- >>> parseInfo s +-- Just (Stats {dp = 2998, alts = [G,C,T], pacs = [3,2,1]}) +parseInfo :: Text -> Maybe Stats +parseInfo line = do + info <- lastMay $ T.splitOn "\t" line + let fields = T.splitOn ";" info + dp' <- toInt =<< fieldValue "DP" fields + rawAlts' <- T.splitOn "," <$> nth 4 cols + alts' <- sequence $ map readNuc rawAlts' -- note mapM = sequence . map + return alts' + pac' <- fieldValue "PAC" fields + pacs' <- sequence $ map toInt $ T.splitOn "," pac' + if (length alts' == length pacs') then + return $ Stats { dp = dp', alts = alts', pacs = pacs'} + else Nothing + where + cols = T.splitOn "\t" line + fieldValue pre xs = nth 1 =<< T.splitOn "=" <$> find (T.isPrefixOf pre) xs + +-- output as TSV, then join with MAAPs output on CHROM and POS + +main = putStrLn $ show $ parseInfo s From 981409bed892be87f39994ec2520531cb97918e0 Mon Sep 17 00:00:00 2001 From: Panciera Date: Wed, 21 Dec 2016 18:34:05 -0500 Subject: [PATCH 2/4] add VCFmerge (executable must be compiled separately) --- Maaps.cabal | 1 + src/VcfMerge.hs | 91 ++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 76 insertions(+), 16 deletions(-) diff --git a/Maaps.cabal b/Maaps.cabal index 13040a2..b6aae38 100644 --- a/Maaps.cabal +++ b/Maaps.cabal @@ -29,6 +29,7 @@ library , biocore , fixed-list , QuickCheck + , safe default-language: Haskell2010 executable Maaps-exe diff --git a/src/VcfMerge.hs b/src/VcfMerge.hs index a17a2ca..22b1e46 100644 --- a/src/VcfMerge.hs +++ b/src/VcfMerge.hs @@ -1,14 +1,23 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE DataKinds #-} -- last three for Options +{-# LANGUAGE TypeOperators #-} +{-# LANGUAGE DeriveGeneric #-} -import Safe (atMay, headMay, lastMay, readMay) -import Data.Maybe (fromMaybe) +import Safe (atMay, headMay, lastMay, readMay, tailMay) +import Data.Maybe (catMaybes, fromMaybe) import qualified Data.Text as T +import qualified Data.Text.IO as T import Data.Text (Text()) -import Data.List (find, intercalate) +import Data.List (find, groupBy) import Control.Monad (when) +import Options.Generic -pos :: Text -> Int -pos s = fromMaybe (-1) (toInt =<< (nth 1 $ T.splitOn "\t" s)) +--pos :: Text -> Int +--pos s = fromMaybe (-1) (toInt =<< (nth 1 $ T.splitOn "\t" s)) +data Options = Options { maaps :: FilePath "Tab delimited MAAPs output" + , vcf :: FilePath "VCF ouptut of ngsmapper." } + deriving (Generic, Show) +instance ParseRecord Options toInt x = readMay $ T.unpack x :: Maybe Int nth = flip atMay @@ -16,30 +25,80 @@ readNuc x = readMay $ T.unpack x :: Maybe Nucleotide data Nucleotide = A | T | C | G deriving (Show, Eq, Read) -data Stats = Stats { dp :: Int, alts :: [Nucleotide], pacs :: [Int] } +data Stats = Stats { dp :: Int, alts :: [Nucleotide], pacs :: [Int], pos :: Int, ref :: Text } deriving (Show, Eq) -- -formatStats Stats {dp=dp', alts=alts', pacs=pacs'} = s where - s = (show dp') ++ "\t" ++ (intercalate "," $ zipWith (\x y -> (show x) ++ "==" ++ (show y)) alts' pacs') --- +formatStats Stats {dp=dp', alts=alts', pacs=pacs', pos=_, ref=_} = s where + s = T.intercalate "\t" [(tshow dp'), showAlts alts' pacs'] + showAlts :: [Nucleotide] -> [Int] -> Text + showAlts [] [] = "." + showAlts xs ys = T.intercalate "," $ zipWith (\x y -> (tshow x) `T.append` "==" `T.append` (tshow y)) xs ys + +tshow :: (Show a) => a -> Text +tshow = T.pack . show -- +--T s = "Den4/KDH0146A/Thailand/Unk/Den4_1\t6\t.\tA\tG,C,T\t.\t.\tDP=2998;RC=2973;RAQ=37;PRC=99;AC=1,10,14;AAQ=37,37,37;PAC=3,2,1;CBD=2973;CB=A;HPOLY" + +byRef xs = groupBy (\x y -> (headMay x) == (headMay y)) xs + +readMaaps = filter (\x -> length x > 1) . map T.words . tail . T.lines +-- note that ngs_mapper vcfs have an entry for every position, even if there are no ALTs there. so suck! +readVCF :: Text -> [Stats] +readVCF = catMaybes . map parseInfo . dropWhile (T.isPrefixOf "#") . T.lines +main = do + opts <- (getRecord "Starting maaps" :: IO Options) + let vcf' = unHelpful $ vcf opts + let tsv = unHelpful $ maaps opts + stats <- readVCF <$> T.readFile vcf' + maaps <- readMaaps <$> T.readFile tsv +-- putStrLn $ unlines $ map show $ joinMaapsStats stats maaps + maapsHeader <- head <$> T.lines <$> T.readFile tsv + let header = maapsHeader `T.append` "\tDP\tAlts" + putStrLn $ T.unpack header + putStrLn $ process stats maaps + +process ms ss = T.unpack $ T.unlines $ map (uncurry output) $ joinMaapsStats ms ss + +type MRec = [Text] + +--output :: Maybe (MRec, Stats) -> String +output :: MRec -> Maybe Stats -> Text +output mr (Just stats) = (T.intercalate "\t" mr) `T.append` "\t" `T.append` (formatStats stats) +output mr _ = T.intercalate "\t" (mr ++ ["-", "-"]) + +joinMaapsStats :: [Stats] -> [MRec] -> [(MRec, Maybe Stats)] +joinMaapsStats tsvs maaps = map (\m -> (m, find (match m) tsvs)) maaps +-- f m = do +-- r <- find (match m) tsvs +-- return (r, m) + +match :: [Text] -> Stats -> Bool +match x y = ((ref' x) == (Just $ ref y)) + && ((toInt =<< nth 2 x) == (Just $ pos y)) where +ref' x = T.tail <$> snd <$> T.breakOn "_" <$> headMay x + + +--groupBy (\x y -> ( (pos y)) ---- get DP; then depending on number of alts, expect and get that number of PAC (note PAC may be zero) -- | hey -- >>> parseInfo s -- Just (Stats {dp = 2998, alts = [G,C,T], pacs = [3,2,1]}) parseInfo :: Text -> Maybe Stats parseInfo line = do - info <- lastMay $ T.splitOn "\t" line + info <- lastMay cols let fields = T.splitOn ";" info dp' <- toInt =<< fieldValue "DP" fields rawAlts' <- T.splitOn "," <$> nth 4 cols - alts' <- sequence $ map readNuc rawAlts' -- note mapM = sequence . map - return alts' - pac' <- fieldValue "PAC" fields - pacs' <- sequence $ map toInt $ T.splitOn "," pac' + let altsMaybe' = mapM readNuc rawAlts' -- note mapM = sequence . map + let pacMaybe' = mapM toInt =<< T.splitOn "," <$> fieldValue "PAC" fields + let pacs' = fromMaybe [] pacMaybe' :: [Int] + let alts' = fromMaybe [] altsMaybe' + --pacs' <- sequence $ map toInt $ T.splitOn "," pac' + pos' <- toInt =<< nth 1 cols + ref' <- nth 0 $ T.splitOn "\t" line if (length alts' == length pacs') then - return $ Stats { dp = dp', alts = alts', pacs = pacs'} + return $ Stats { dp = dp', alts = alts', pacs = pacs', pos = pos', ref = ref'} else Nothing where cols = T.splitOn "\t" line @@ -47,4 +106,4 @@ parseInfo line = do -- output as TSV, then join with MAAPs output on CHROM and POS -main = putStrLn $ show $ parseInfo s +--main = putStrLn $ show $ parseInfo s From 2a50b7e35038666b859f9e89fa3c4935e738002f Mon Sep 17 00:00:00 2001 From: Michael Panciera Date: Tue, 7 Feb 2017 14:39:34 -0500 Subject: [PATCH 3/4] fixed parsing bug where got excluded --- src/VcfMerge.hs | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/VcfMerge.hs b/src/VcfMerge.hs index 22b1e46..6264bde 100644 --- a/src/VcfMerge.hs +++ b/src/VcfMerge.hs @@ -8,7 +8,7 @@ import Data.Maybe (catMaybes, fromMaybe) import qualified Data.Text as T import qualified Data.Text.IO as T import Data.Text (Text()) -import Data.List (find, groupBy) +import Data.List (find, groupBy, group) import Control.Monad (when) import Options.Generic @@ -22,9 +22,10 @@ instance ParseRecord Options toInt x = readMay $ T.unpack x :: Maybe Int nth = flip atMay readNuc x = readMay $ T.unpack x :: Maybe Nucleotide -data Nucleotide = A | T | C | G +data Nucleotide = A | T | C | G | (*) deriving (Show, Eq, Read) + data Stats = Stats { dp :: Int, alts :: [Nucleotide], pacs :: [Int], pos :: Int, ref :: Text } deriving (Show, Eq) -- @@ -44,15 +45,15 @@ byRef xs = groupBy (\x y -> (headMay x) == (headMay y)) xs readMaaps = filter (\x -> length x > 1) . map T.words . tail . T.lines -- note that ngs_mapper vcfs have an entry for every position, even if there are no ALTs there. so suck! +--readVCF = catMaybes . map parseInfo . dropWhile (T.isPrefixOf "#") . T.lines readVCF :: Text -> [Stats] -readVCF = catMaybes . map parseInfo . dropWhile (T.isPrefixOf "#") . T.lines +readVCF x = fromMaybe (error "Failed to parse vcf file.") $ traverse parseInfo $ dropWhile (T.isPrefixOf "#") $ T.lines x main = do opts <- (getRecord "Starting maaps" :: IO Options) let vcf' = unHelpful $ vcf opts let tsv = unHelpful $ maaps opts stats <- readVCF <$> T.readFile vcf' maaps <- readMaaps <$> T.readFile tsv --- putStrLn $ unlines $ map show $ joinMaapsStats stats maaps maapsHeader <- head <$> T.lines <$> T.readFile tsv let header = maapsHeader `T.append` "\tDP\tAlts" putStrLn $ T.unpack header @@ -60,23 +61,25 @@ main = do process ms ss = T.unpack $ T.unlines $ map (uncurry output) $ joinMaapsStats ms ss -type MRec = [Text] ---output :: Maybe (MRec, Stats) -> String -output :: MRec -> Maybe Stats -> Text +--output :: Maybe ([Text], Stats) -> String +output :: [Text] -> Maybe Stats -> Text output mr (Just stats) = (T.intercalate "\t" mr) `T.append` "\t" `T.append` (formatStats stats) -output mr _ = T.intercalate "\t" (mr ++ ["-", "-"]) +output mr Nothing = T.intercalate "\t" (mr ++ ["-", "-"]) -joinMaapsStats :: [Stats] -> [MRec] -> [(MRec, Maybe Stats)] -joinMaapsStats tsvs maaps = map (\m -> (m, find (match m) tsvs)) maaps +joinMaapsStats :: [Stats] -> [[Text]] -> [([Text], Maybe Stats)] +joinMaapsStats vcf maaps = map (\m -> (m, find (matcher m) vcf)) maaps + where matcher = if (length $ group $ fmap ref vcf) == 1 then matchPos else matchBoth -- f m = do -- r <- find (match m) tsvs -- return (r, m) - -match :: [Text] -> Stats -> Bool -match x y = ((ref' x) == (Just $ ref y)) - && ((toInt =<< nth 2 x) == (Just $ pos y)) where -ref' x = T.tail <$> snd <$> T.breakOn "_" <$> headMay x +--matchPos x y = (toInt =<< nth 2 x) == (Just $ pos y) +matchPos :: [Text] -> Stats -> Bool +matchPos x y = (read $ T.unpack (x !! 2)) == (pos y) +matchBoth :: [Text] -> Stats -> Bool +matchBoth x y = ((ref' x) == (ref y)) && matchPos x y +--ref' x = T.tail <$> snd <$> T.breakOn "_" <$> headMay x +ref' x = T.tail $ snd $ T.breakOn "_" $ head x --groupBy (\x y -> ( (pos y)) From bb43f7988580858889543e0a1d51217a381f669a Mon Sep 17 00:00:00 2001 From: Michael Panciera Date: Tue, 7 Feb 2017 14:47:19 -0500 Subject: [PATCH 4/4] output includes Ref/CHROM --- src/VcfMerge.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/VcfMerge.hs b/src/VcfMerge.hs index 6264bde..0a71d6f 100644 --- a/src/VcfMerge.hs +++ b/src/VcfMerge.hs @@ -29,8 +29,8 @@ data Nucleotide = A | T | C | G | (*) data Stats = Stats { dp :: Int, alts :: [Nucleotide], pacs :: [Int], pos :: Int, ref :: Text } deriving (Show, Eq) -- -formatStats Stats {dp=dp', alts=alts', pacs=pacs', pos=_, ref=_} = s where - s = T.intercalate "\t" [(tshow dp'), showAlts alts' pacs'] +formatStats Stats {dp=dp', alts=alts', pacs=pacs', pos=_, ref=ref'} = s where + s = T.intercalate "\t" [(tshow dp'), showAlts alts' pacs', ref'] showAlts :: [Nucleotide] -> [Int] -> Text showAlts [] [] = "." showAlts xs ys = T.intercalate "," $ zipWith (\x y -> (tshow x) `T.append` "==" `T.append` (tshow y)) xs ys @@ -55,7 +55,7 @@ main = do stats <- readVCF <$> T.readFile vcf' maaps <- readMaaps <$> T.readFile tsv maapsHeader <- head <$> T.lines <$> T.readFile tsv - let header = maapsHeader `T.append` "\tDP\tAlts" + let header = maapsHeader `T.append` "\tDP\tAlts\tRef" putStrLn $ T.unpack header putStrLn $ process stats maaps