// Mirror of https://github.com/zeek/zeek.git (synced 2025-10-02 06:38:20 +00:00).
// 148 lines, 4.4 KiB, C++.
// See the file "COPYING" in the main distribution directory for copyright.
|
|
|
|
#pragma once
|
|
|
|
#include <cstdint>
#include <map>
#include <string>
#include <vector>

#include "zeek/ZeekString.h"
|
|
|
|
namespace zeek::detail {
|
|
|
|
/**
|
|
* Substrings are essentially Strings, augmented with indexing information
|
|
* required for the Smith-Waterman algorithm. Each substring can be
|
|
* marked as being a common substring of arbitrarily many strings, for each
|
|
* of which we store where the substring starts.
|
|
*/
|
|
class Substring : public String {
|
|
public:
|
|
using Vec = std::vector<Substring*>;
|
|
|
|
// An alignment to another string.
|
|
//
|
|
struct BSSAlign {
|
|
BSSAlign(const String* string, int index) {
|
|
this->string = string;
|
|
this->index = index;
|
|
}
|
|
|
|
// The other string
|
|
//
|
|
const String* string;
|
|
|
|
// Offset in the string that substring
|
|
// starts at, counting from 0.
|
|
//
|
|
int index;
|
|
};
|
|
|
|
using BSSAlignVec = std::vector<BSSAlign>;
|
|
|
|
Substring() = delete;
|
|
|
|
explicit Substring(const std::string& string) : String(string), _num(), _new(false) {}
|
|
|
|
explicit Substring(const String& string) : String(string), _num(), _new(false) {}
|
|
|
|
Substring(const Substring& bst);
|
|
|
|
const Substring& operator=(const Substring& bst);
|
|
|
|
// Returns true if this string completely covers the given one.
|
|
// "Covering" means that the substring must be at least as long
|
|
// as the one compared to, and completely covers the range occupied
|
|
// by the given one.
|
|
//
|
|
bool DoesCover(const Substring* bst) const;
|
|
|
|
void AddAlignment(const String* string, int index);
|
|
const BSSAlignVec& GetAlignments() const { return _aligns; }
|
|
unsigned int GetNumAlignments() const { return _aligns.size(); }
|
|
|
|
void SetNum(int num) { _num = num; }
|
|
int GetNum() const { return _num; }
|
|
|
|
void MarkNewAlignment(bool mark) { _new = mark; }
|
|
bool IsNewAlignment() { return _new; }
|
|
|
|
// Helper methods for vectors:
|
|
//
|
|
static VectorVal* VecToPolicy(Vec* vec);
|
|
static Vec* VecFromPolicy(VectorVal* vec);
|
|
static char* VecToString(Vec* vec);
|
|
static String::IdxVec* GetOffsetsVec(const Vec* vec, unsigned int index);
|
|
|
|
private:
|
|
using DataMap = std::map<std::string, void*>;
|
|
|
|
// The alignments registered for this substring.
|
|
BSSAlignVec _aligns;
|
|
|
|
// Every substring can have a numerical label.
|
|
int _num;
|
|
|
|
// True if this node marks the start of a new alignment.
|
|
bool _new;
|
|
};
|
|
|
|
// A comparison class that sorts Substrings according to the string
|
|
// offset value of the nth input string, where "nth" starts from 0.
|
|
//
|
|
class SubstringCmp {
|
|
public:
|
|
explicit SubstringCmp(unsigned int index) { _index = index; }
|
|
bool operator()(const Substring* bst1, const Substring* bst2) const;
|
|
|
|
private:
|
|
unsigned int _index;
|
|
};
|
|
|
|
// Smith-Waterman Implementation
// ---------------------------------------------------------------------
//
// We support two modes of operation: finding a single optimal alignment,
// and repeated alignments.
//
enum SWVariant : uint8_t {
    SW_SINGLE = 0,   // return a single, optimum alignment
    SW_MULTIPLE = 1, // find repeated, non-overlapping alignments
};
|
|
|
|
// Parameters for Smith-Waterman are stored in this simple record.
|
|
//
|
|
struct SWParams {
|
|
explicit SWParams(unsigned int min_toklen = 3, SWVariant sw_variant = SW_SINGLE) {
|
|
_min_toklen = min_toklen;
|
|
_sw_variant = sw_variant;
|
|
}
|
|
|
|
// The minimum string size to report. For example, min_toklen = 2
|
|
// won't report any common single-letter subsequences.
|
|
unsigned int _min_toklen;
|
|
|
|
SWVariant _sw_variant;
|
|
};
|
|
|
|
// The smith_waterman() algorithm finds the longest common subsequence(s)
|
|
// of two strings, also known as the best local alignment. A subsequence
|
|
// is a sequence of common substrings.
|
|
//
|
|
// s1: first input string
|
|
// s2: second input string
|
|
// params: Smith-Waterman parameters.
|
|
//
|
|
// Subsequences of a string are any strings based on the original one
|
|
// with individual characters left out. Note that this is different
|
|
// from the longest common substring problem.
|
|
//
|
|
// The function returns a vector consisting of all substrings comprising
|
|
// the subsequence. With each string you also get the indices of both
|
|
// input strings where the string occurs. On error, or if no common
|
|
// subsequence exists, an empty vector is returned.
|
|
//
|
|
extern Substring::Vec* smith_waterman(const String* s1, const String* s2, SWParams& params);
|
|
|
|
} // namespace zeek::detail
|