1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
//! Functions for working with FASTA files.

use std::rc::Rc;

use noodles_core::Position;
use noodles_fasta::record::Sequence;
use noodles_fasta::{self, Record};

/// One Record will be split into multiple RecordPieces.
/// The original Record is kept as an Rc so that each of the
/// RecordPieces can share the same ownership.
pub struct RecordPiece {
    pub record: Rc<Record>,
    pub start: Position,
    pub end: Position,
}

impl RecordPiece {
    fn new(record: Rc<Record>, start: Position, end: Position) -> Self {
        Self { record, start, end }
    }

    /// Get the sequence of the RecordPiece by slicing into the original Record.
    pub fn sequence(&self) -> Sequence {
        self.record.sequence().slice(self.start..=self.end).unwrap()
    }
}

#[allow(dead_code)]
/// Given a record, split the sequence by runs of Ns.
///
/// Returns a vector of records, each with a sequence that does not contain any Ns.
/// The description of each record is set to the start-end position of the sequence,
/// the positions being 1-based.
///
/// Input:
/// ```text
/// >chr42
/// ATGCATGC
/// NNNNATGC
/// A
/// ```
///
/// Output:
/// ```text
/// >chr42 1-8
/// ATGCATGC
/// >chr42 13-17
/// ATGCA
/// ```
pub fn split_seq_by_n(record: Record) -> Vec<RecordPiece> {
    let mut records = Vec::new();
    let n = record.sequence().len();
    let seq = record.sequence().as_ref();
    let mut pos = 0;
    // classic two-pointer approach is tried-and-true
    // but might not be the most idiomatic Rust
    while pos < n {
        while (pos < n) && (seq[pos] == b'N') {
            pos += 1;
        }
        let left = pos;
        while (pos < n) && (seq[pos] != b'N') {
            pos += 1;
        }
        let right = pos;
        if left < right {
            // Position is 1-based so add 1 to left
            let start = Position::try_from(left + 1).unwrap();
            let end = Position::try_from(right).unwrap();
            let rec_rc = Rc::new(record.to_owned());
            let piece = RecordPiece::new(rec_rc, start, end);
            records.push(piece);
        }
    }
    records
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_read_fasta() {
        // two sequences
        let src = b">sq0\nACGT\n>sq1\nN\n";
        let mut reader = noodles_fasta::Reader::new(&src[..]);
        let first_rec = reader.records().next().unwrap().unwrap();
        let second_rec = reader.records().next().unwrap().unwrap();
        assert_eq!(first_rec.definition().name(), b"sq0");
        assert_eq!(second_rec.definition().name(), b"sq1");
        let start = Position::try_from(2).unwrap();
        let end = Position::try_from(3).unwrap();
        assert_eq!(
            first_rec.sequence().slice(start..=end).unwrap().as_ref(),
            b"CG".to_vec()
        );
    }

    #[test]
    fn test_windows() {
        let seq = b"ACGTACGTACGTACGTACGT";
        let mut seq_bytes = seq.windows(3);
        assert_eq!(seq_bytes.next().unwrap(), b"ACG");
        assert_eq!(seq_bytes.next().unwrap(), b"CGT");
        assert_eq!(seq_bytes.next().unwrap(), b"GTA");
    }

    #[test]
    fn test_splitting() {
        let src = b">chr42\nATGCATGCNNNNATGCA\n";
        let mut reader = noodles_fasta::Reader::new(&src[..]);
        let split_records: Vec<_> = reader
            .records()
            .flat_map(|rec| split_seq_by_n(rec.unwrap()))
            .collect();
        assert_eq!(split_records.len(), 2);
        assert_eq!(split_records[0].sequence().as_ref(), b"ATGCATGC".to_vec());
        assert_eq!(split_records[1].sequence().as_ref(), b"ATGCA".to_vec());
        assert_eq!(split_records[1].sequence().as_ref(), b"ATGCA".to_vec());
        assert_eq!(usize::from(split_records[1].start), 13);
        assert_eq!(usize::from(split_records[1].end), 17);
    }

    #[test]
    fn test_splitting_empty() {
        let src = b">chr42\n\n";
        let mut reader = noodles_fasta::Reader::new(&src[..]);
        let split_records: Vec<_> = reader
            .records()
            .flat_map(|rec| split_seq_by_n(rec.unwrap()))
            .collect();
        assert_eq!(split_records.len(), 0);
    }
}