-
Notifications
You must be signed in to change notification settings - Fork 4
/
RLZ_index.h
217 lines (179 loc) · 8.35 KB
/
RLZ_index.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/* RLZ index
* Copyright (C) 2011 Shanika Kuruppu
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* RLZ - Relative Lempel Ziv
* Implements a self-index using RLZ.
* Authors: Shanika Kuruppu (kuruppu@csse.unimelb.edu.au)
*/
#include <stdio.h>
#include <vector>
#include <Array.h>
#include <BitSequenceSDArray.h>
#include <BitSequence.h>
#include <TextIndex.h>
typedef struct occurrence
{
uint64_t seq;
uint64_t pos;
} occ_t;
class RLZ_index
{
public:
/* Constructor */
RLZ_index(char *filenames);
/* Destructor */
~RLZ_index();
/** Display function */
void display();
/** Count function */
void count();
/** Search function */
void locate();
/** Factor file decode function */
void decode();
/** Prints the space usage of the RLZ data structure */
int size();
private:
/* Length of the reference sequence */
uint64_t refseqlen;
uint64_t logrefseqlen;
/* Number of sequences */
uint64_t numseqs;
/* Number of factors */
uint64_t numfacs;
/* The names of the input files */
char **filenames;
// If this is true, can only read the data structures required
// to implement display() query
bool displayonly;
/*-------------------------------------------------------------------*/
/* Default data structures and methods */
/*-------------------------------------------------------------------*/
/* Reference sequence as a bit vector with 3bpb encoding
* {a,c,g,t,n} */
cds_utils::Array *refseq;
/* Factor positions as a bit vector using logn bits per position */
cds_utils::Array *positions;
/* Factor facstarts in a rank-select data structure */
cds_static::BitSequenceSDArray *facstarts;
/* Sequence cumseqlens for numseqs sequences */
cds_utils::Array *cumseqlens;
/** Reads the reference sequence into memory and fills in the
* refseqlen and refseq variables. Also creates the suffix array
* of the reference sequence and stores it in SA.
* @param filename Reference sequence file name
*/
void read_reference_sequence(char *filename);
/* Reads the factors into memory and fill the positions, facstarts
* and cumseqlens arrays */
void read_compressed_factors();
/** Retrieves a substring starting at start and ending at end for
* seq.
* @param seq Sequence to retrieve from
* @param start Start position to retrieve from
* @param end End position to stop retrieving
* @return Time taken to evaluate the query in microseconds
*/
long display(uint64_t seq, uint64_t start, uint64_t end,
std::vector<uint> &substring);
/** Retrieves a substring starting and ending at absolute
* positions in the index.
* @param start Start position in the index to retrieve from
* @param end End position in the index to retrieve from
* @param substring Vector to store the retrieved substring
* @return Time taken to evaluate the query in microseconds
*/
long display(uint64_t start, uint64_t end, vector <uint> &substring);
/** Given a factor id, returns the length of that factor.
* @param facidx Factor id
* @return Length of the factor
*/
inline uint64_t factor_length(uint32_t facidx);
/*-------------------------------------------------------------------*/
/* Locate related data structures and methods */
/*-------------------------------------------------------------------*/
// Suffix array of the reference sequence
cds_utils::Array *sa;
// Nested level lists of the sorted factors
cds_utils::Array *nll;
cds_utils::Array *levelidx;
uint32_t numlevels;
// Compressed bit vectors to indicate at which positions in the
// reference sequence factors start and end at
cds_static::BitSequenceRRR *isstart;
cds_static::BitSequenceRRR *isend;
cds_static::BitSequenceSDArray *seqfacstart;
/** Implements the search functionality.
* @param pattern Pattern to search for
* @param ptnlen Length of the pattern being searched for
* @param occs Vector to store occurrences if iscount=false
* @param iscount True if just counting the occurrence, false otherwise
* @return Number of occurrences of the pattern
*/
uint64_t search(const char *pattern, unsigned int ptnlen,
vector<occ_t>& occs, bool iscount=false);
/** Returns the boundaries of the suffix array that contains the
* given pattern. lb and rb are set to (uint64_t)-1 if pattern
* does not occur in the suffix array.
* @param pattern Pattern to search for
* @param lb Variable to store the return left boundary
* @param rb Variable to store the return right boundary
*/
void sa_binary_search(cds_utils::Array &pattern, uint64_t *lb,
uint64_t *rb);
/** Returns the boundaries of a level in the nested level list
* that contains the given interval represented by the variables
* start and end. lb and lr are set to (uint32_t)-1 if the
* interval does not occur in the suffix array.
* @param start Start position of the interval
* @param end End position of the interval
* @param lb Initial left boundary and the place to store the return left boundary
* @param rb INitial right boundary and the place to store the return right boundary
*/
void facs_binary_search(uint64_t start, uint64_t end,
uint32_t *lb, uint32_t *rb);
/** Returns the boundaries of a level in the nested level list
* that contains the factors that start with the given start
* position. lb and lr are set to (uint32_t)-1 if the interval
* does not occur in the suffix array.
* @param start Start position of an interval to search for
* @param lb Initial left boundary and the place to store the return left boundary
* @param rb Initial right boundary and the place to store the return right boundary
*/
void factor_start_binary_search(uint64_t start, uint32_t *lb,
uint32_t *rb);
/** Returns the boundaries of a level in the nested level list
* that contains the factors that end with the given end
* position. lb and lr are set to (uint32_t)-1 if the interval
* does not occur in the suffix array.
* @param end End position of an interval to search for
* @param lb Initial left boundary and the place to store the return left boundary
* @param rb Initial right boundary and the place to store the return right boundary
*/
void factor_end_binary_search(uint64_t end, uint32_t *lb,
uint32_t *rb);
/** Compare the given substring to the reference sequence
* starting from the start position.
* @param substr Substring to search for
* @param start Position in the reference sequence to search from
* @param len Length of the substring to search for
* @return Returns true if the substrings are equal and false otherwise
*/
inline bool compare_substr_to_refseq(cds_utils::Array& substr,
uint64_t start,
uint64_t len);
};