zeek/src/SmithWaterman.h
2020-07-02 17:24:22 -07:00

153 lines
4.6 KiB
C++

// See the file "COPYING" in the main distribution directory for copyright.
#pragma once
#include "ZeekString.h"
#include <map>
// BroSubstrings are essentially BroStrings, augmented with indexing
// information required for the Smith-Waterman algorithm. Each substring
// can be marked as being a common substring of arbitrarily many strings,
// for each of which we store where the substring starts.
//
//
class BroSubstring : public zeek::String {
public:
typedef std::vector<BroSubstring*> Vec;
typedef Vec::iterator VecIt;
typedef Vec::const_iterator VecCIt;
// An alignment to another string.
//
struct BSSAlign {
BSSAlign(const zeek::String* string, int index)
{ this->string = string; this->index = index; }
// The other string
//
const zeek::String* string;
// Offset in the string that substring
// starts at, counting from 0.
//
int index;
};
typedef std::vector<BSSAlign> BSSAlignVec;
typedef BSSAlignVec::iterator BSSAlignVecIt;
typedef BSSAlignVec::const_iterator BSSAlignVecCIt;
explicit BroSubstring(const std::string& string)
: zeek::String(string), _num(), _new(false) { }
explicit BroSubstring(const zeek::String& string)
: zeek::String(string), _num(), _new(false) { }
BroSubstring(const BroSubstring& bst);
const BroSubstring& operator=(const BroSubstring& bst);
// Returns true if this string completely covers the given one.
// "Covering" means that the substring must be at least as long
// as the one compared to, and completely covers the range occupied
// by the given one.
//
bool DoesCover(const BroSubstring* bst) const;
void AddAlignment(const zeek::String* string, int index);
const BSSAlignVec& GetAlignments() const { return _aligns; }
unsigned int GetNumAlignments() const { return _aligns.size(); }
void SetNum(int num) { _num = num; }
int GetNum() const { return _num; }
void MarkNewAlignment(bool mark) { _new = mark; }
bool IsNewAlignment() { return _new; }
// Helper methods for vectors:
//
static zeek::VectorVal* VecToPolicy(Vec* vec);
static Vec* VecFromPolicy(zeek::VectorVal* vec);
static char* VecToString(Vec* vec);
static zeek::String::IdxVec* GetOffsetsVec(const Vec* vec,
unsigned int index);
private:
typedef std::map<std::string, void*> DataMap;
typedef DataMap::iterator DataMapIt;
BroSubstring();
// The alignments registered for this substring.
BSSAlignVec _aligns;
// Every substring can have a numerical label.
int _num;
// True if this node marks the start of a new alignment.
bool _new;
};
// Ordering predicate for BroSubstring pointers. Substrings are sorted by
// the string offset value they have in the nth input string, where n is
// the zero-based index handed to the constructor.
//
class BroSubstringCmp {
public:
	explicit BroSubstringCmp(unsigned int index)
		: _index(index) { }

	// The comparison itself is defined out of line.
	bool operator()(const BroSubstring* bst1, const BroSubstring* bst2) const;

private:
	// Zero-based number of the input string whose offsets are compared.
	unsigned int _index;
};
// Smith-Waterman Implementation
// ---------------------------------------------------------------------
//
// Two modes of operation are supported: finding a single optimal
// alignment, and finding repeated alignments.
//
enum SWVariant {
	// Return a single, optimum alignment.
	SW_SINGLE = 0,

	// Find repeated, non-overlapping alignments.
	SW_MULTIPLE = 1
};
// Parameters for Smith-Waterman are stored in this simple record.
//
struct SWParams {
explicit SWParams(unsigned int min_toklen = 3, SWVariant sw_variant = SW_SINGLE)
{
_min_toklen = min_toklen;
_sw_variant = sw_variant;
}
// The minimum string size to report. For example, min_toklen = 2
// won't report any common single-letter subsequences.
unsigned int _min_toklen;
SWVariant _sw_variant;
};
// smith_waterman() computes the longest common subsequence(s) of two
// strings, which is also known as the best local alignment. A
// subsequence is a sequence of common substrings.
//
// A subsequence of a string is any string obtained from the original
// by leaving out individual characters. Note that this is different
// from the longest common substring problem.
//
// Arguments:
//   s1:     first input string
//   s2:     second input string
//   params: Smith-Waterman parameters.
//
// Returns a vector consisting of all substrings comprising the
// subsequence. With each string you also get the indices of both input
// strings where the string occurs. On error, or if no common
// subsequence exists, an empty vector is returned.
//
BroSubstring::Vec* smith_waterman(
	const zeek::String* s1,
	const zeek::String* s2,
	SWParams& params);