1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
#!/usr/bin/env python
#
# Copyright 2011 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
#
"""
This module contains functions for aligning sequences.
>>> import random
>>> rndm = random.Random()
>>> rndm.seed(1234)
>>> ran_seq = [rndm.randint(0,1) for i in range(0, 100)]
>>> offset_seq = [0] * 20 + ran_seq
>>> correct, overlap, offset = align_sequences(ran_seq, offset_seq)
>>> print(correct, overlap, offset)
(1.0, 100, -20)
>>> offset_err_seq = []
>>> for bit in offset_seq:
... if rndm.randint(0,4) == 4:
... offset_err_seq.append(rndm.randint(0,1))
... else:
... offset_err_seq.append(bit)
>>> correct, overlap, offset = align_sequences(ran_seq, offset_err_seq)
>>> print(overlap, offset)
(100, -20)
"""
import random
# DEFAULT PARAMETERS
# If the fraction of matching bits between two sequences is greater than
# this the sequences are assumed to be aligned.
def_correct_cutoff = 0.9
# The maximum offset to test during sequence alignment.
def_max_offset = 500
# The maximum number of samples to take from two sequences to check alignment.
def_num_samples = 1000
def compare_sequences(d1, d2, offset, sample_indices=None):
"""
Takes two binary sequences and an offset and returns the number of
matching entries and the number of compared entries.
d1 & d2 -- sequences
offset -- offset of d2 relative to d1
sample_indices -- a list of indices to use for the comparison
"""
max_index = min(len(d1), len(d2)+offset)
if sample_indices is None:
sample_indices = list(range(0, max_index))
correct = 0
total = 0
for i in sample_indices:
if i >= max_index:
break
if d1[i] == d2[i-offset]:
correct += 1
total += 1
return (correct, total)
def random_sample(size, num_samples=def_num_samples, seed=None):
"""
Returns a set of random integers between 0 and (size-1).
The set contains no more than num_samples integers.
"""
rndm = random.Random()
rndm.seed(seed)
if num_samples > size:
indices = set(range(0, size))
else:
if num_samples > size / 2:
num_samples = num_samples / 2
indices = set([])
while len(indices) < num_samples:
index = rndm.randint(0, size-1)
indices.add(index)
indices = list(indices)
indices.sort()
return indices
def align_sequences(d1, d2,
num_samples=def_num_samples,
max_offset=def_max_offset,
correct_cutoff=def_correct_cutoff,
seed=None,
indices=None):
"""
Takes two sequences and finds the offset and which the two sequences best
match. It returns the fraction correct, the number of entries compared,
the offset.
d1 & d2 -- sequences to compare
num_samples -- the maximum number of entries to compare
max_offset -- the maximum offset between the sequences that is checked
correct_cutoff -- If the fraction of bits correct is greater than this then
the offset is assumed to optimum.
seed -- a random number seed
indices -- an explicit list of the indices used to compare the two sequences
"""
max_overlap = max(len(d1), len(d2))
if indices is None:
indices = random_sample(max_overlap, num_samples, seed)
max_frac_correct = 0
best_offset = None
best_compared = None
best_correct = None
pos_range = list(range(0, min(len(d1), max_offset)))
neg_range = list(range(-1, -min(len(d2), max_offset), -1))
# Interleave the positive and negative offsets.
int_range = [item for items in zip(pos_range, neg_range) for item in items]
for offset in int_range:
correct, compared = compare_sequences(d1, d2, offset, indices)
frac_correct = 1.0*correct/compared
if frac_correct > max_frac_correct:
max_frac_correct = frac_correct
best_offset = offset
best_compared = compared
best_correct = correct
if frac_correct > correct_cutoff:
break
return max_frac_correct, best_compared, best_offset, indices
if __name__ == "__main__":
import doctest
doctest.testmod()
|