C# Array Subset fetching - c#

I have an array of bytes and I want to determine whether its contents exist within another, larger array as a contiguous sequence. What is the simplest way to go about doing this?

The naive approach is:
/// <summary>
/// Determines whether <paramref name="subset"/> occurs inside <paramref name="set"/>
/// as one contiguous run of bytes.
/// </summary>
/// <param name="set">The array that is searched.</param>
/// <param name="subset">The byte sequence to look for.</param>
/// <returns>
/// true when the sequence is found; an empty <paramref name="subset"/> is treated as
/// contained in every array, including an empty one. Otherwise false.
/// </returns>
public static bool IsSubsetOf(byte[] set, byte[] subset)
{
    // Last start index at which subset still fits; negative when subset is longer than set.
    int lastStart = set.Length - subset.Length;

    for (int start = 0; start <= lastStart; ++start)
    {
        // Compare in place instead of building a Skip/Take pipeline for every start index.
        int offset = 0;
        while (offset < subset.Length && set[start + offset] == subset[offset])
        {
            ++offset;
        }

        if (offset == subset.Length)
        {
            return true;
        }
    }

    return false;
}
For more efficient approaches, you might consider more advanced string matching algorithms like KMP.

Try to adapt some string search algorithm. One of the fastest is Boyer-Moore . It's quite easy as well. For binary data, Knuth-Morris-Pratt algorithm might work very efficiently as well.

The following is a 1:1 port of this answer: Searching for a sequence of Bytes in a Binary File with Java
It is a very efficient way of doing so:
/// <summary>
/// Knuth-Morris-Pratt search for a byte pattern inside a byte array.
/// </summary>
public static class KmpSearch
{
    /// <summary>
    /// Finds the first occurrence of <paramref name="pattern"/> in <paramref name="data"/>.
    /// </summary>
    /// <param name="data">The bytes to search.</param>
    /// <param name="pattern">The byte sequence to look for.</param>
    /// <returns>
    /// The zero-based index of the first match, 0 for an empty pattern, or -1 when the
    /// pattern does not occur.
    /// </returns>
    public static int IndexOf(byte[] data, byte[] pattern)
    {
        // An empty pattern matches at offset 0 (the string.IndexOf convention). Without
        // this guard the loop below would read pattern[0] and throw.
        if (pattern.Length == 0)
        {
            return 0;
        }

        // A pattern longer than the data can never match, so skip building the table.
        if (pattern.Length > data.Length)
        {
            return -1;
        }

        int[] failure = ComputeFailure(pattern);
        int j = 0; // number of pattern bytes matched so far

        for (int i = 0; i < data.Length; i++)
        {
            // On a mismatch fall back to the longest prefix that is still a valid match.
            while (j > 0 && pattern[j] != data[i])
            {
                j = failure[j - 1];
            }

            if (pattern[j] == data[i])
            {
                j++;
            }

            if (j == pattern.Length)
            {
                return i - pattern.Length + 1;
            }
        }

        return -1;
    }

    /// <summary>
    /// Builds the failure table: entry i is the length of the longest proper prefix of
    /// pattern[0..i] that is also a suffix of it.
    /// </summary>
    private static int[] ComputeFailure(byte[] pattern)
    {
        int[] failure = new int[pattern.Length];
        int j = 0;

        for (int i = 1; i < pattern.Length; i++)
        {
            while (j > 0 && pattern[j] != pattern[i])
            {
                j = failure[j - 1];
            }

            if (pattern[j] == pattern[i])
            {
                j++;
            }

            failure[i] = j;
        }

        return failure;
    }
}

Related

Bitwise operators on different length BitArrays

I have 2 BitArray items and I need to know if any of bits are the same in each, "AND".
However, the lengths of the BitArrays can differ, and either one can be larger or smaller than the other.
How can I do an "AND" of two BitArrays, without getting an exception because of different sizes?
This is going to happen a lot, so I need it to be fairly quick.
Example
// One scratch array of ids is refilled three times; after each fill a BitArray is
// built that is just long enough for the largest id and has exactly those bits set.
int[] ids = new int[3];

ids[0] = 1;
ids[1] = 3;
ids[2] = 5;
BitArray bs1 = new BitArray(ids.Max() + 1);
foreach (int id in ids)
{
    bs1[id] = true;
}

ids[0] = 1;
ids[1] = 59;
ids[2] = 1111;
BitArray bs2 = new BitArray(ids.Max() + 1);
foreach (int id in ids)
{
    bs2[id] = true;
}

ids[0] = 0;
ids[1] = 5;
ids[2] = 33;
BitArray bs3 = new BitArray(ids.Max() + 1);
foreach (int id in ids)
{
    bs3[id] = true;
}

//if bs1 AND bs2 bitcount > 0 DisplayMessage("1 and 2 has some same items")
//if bs1 AND bs3 bitcount > 0 DisplayMessage("1 and 3 has some same items")
//if bs2 AND bs3 bitcount > 0 DisplayMessage("2 and 3 has some same items")
To solve my problem I modified the BitArray code and added the following
/// <summary>
/// Creates a new MyBitArray of <paramref name="size"/> bits that holds the first
/// <paramref name="size"/> bits of <paramref name="source"/>.
/// </summary>
/// <param name="source">The array whose leading bits are copied.</param>
/// <param name="size">The number of bits in the returned array.</param>
/// <returns>A new array; bits at or above <paramref name="size"/> in its last word are cleared.</returns>
public static MyBitArray TruncateCopy(MyBitArray source, int size)
{
    MyBitArray dest = new MyBitArray(size);

    // Copy whole 32-bit words, never reading past the end of the source storage.
    for (int i = 0; i < dest.m_array.Length && i < source.m_array.Length; ++i)
    {
        dest.m_array[i] = source.m_array[i];
    }

    // Bit i is stored in m_array[i >> 5] at position (i & 31), so the only word that can
    // contain bits beyond 'size' is the one at index size >> 5, and only when size is
    // not a multiple of 32. Keep its low (size % 32) bits and clear the rest.
    // NOTE(review): assumes new MyBitArray(size) allocates at least ceil(size / 32)
    // words, as System.Collections.BitArray does - confirm against the constructor.
    int usedBitsInLastWord = size % 32;
    if (usedBitsInLastWord != 0)
    {
        dest.m_array[size >> 5] &= (1 << usedBitsInLastWord) - 1;
    }

    return dest;
}
/// <summary>
/// Reports whether this array and <paramref name="comp"/> share at least one set bit,
/// looking only at the bit positions that exist in both arrays.
/// </summary>
/// <param name="comp">The array to compare against; it may be longer or shorter than this one.</param>
/// <returns>true when the AND of the two arrays (cut to the shorter length) is not empty.</returns>
public bool HasCommonBits(MyBitArray comp)
{
    bool thisIsShorter = this.Length < comp.Length;
    MyBitArray shorter = thisIsShorter ? this : comp;
    MyBitArray longer = thisIsShorter ? comp : this;

    // The longer operand is replaced by a copy cut down to the shorter length, so And()
    // receives two arrays of equal length (presumably it rejects differing lengths the
    // way System.Collections.BitArray does - verify against MyBitArray.And).
    MyBitArray truncated = TruncateCopy(longer, shorter.Length);

    return !truncated.And(shorter).IsEmpty();
}
/// <summary>
/// Reports whether every storage word of this array is zero.
/// </summary>
/// <returns>false as soon as a non-zero word is found in m_array; otherwise true.</returns>
public bool IsEmpty()
{
    foreach (var word in this.m_array)
    {
        if (word != 0)
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Reports whether every bit in the range [0, Length) is set.
/// </summary>
/// <returns>true when all Length bits are set (trivially true for a zero-length array).</returns>
public bool IsFull()
{
    // Words that are completely covered by Length can be compared as a whole.
    // NOTE(review): assumes m_array holds at least Length / 32 words - confirm.
    int fullWords = this.Length / 32;
    for (int i = 0; i < fullWords; ++i)
    {
        if (m_array[i] != -1) //-1 is all bits set in an integer
            return false;
    }

    // The remaining Length % 32 bits live in the last word, so they are addressed by
    // their absolute index. This also means the last word is no longer skipped when
    // Length is an exact multiple of 32.
    for (int bit = fullWords * 32; bit < this.Length; ++bit)
    {
        if (!this[bit])
            return false;
    }

    return true;
}
}
First, define what you want to happen in case of differing lengths. Maybe you just want to compare the first Math.Min(len1, len2) elements. In that case write a for loop whose index variable ranges from 0 to Math.Min(len1, len2). Compare the respective array elements in the loop body.
I examined BitArray with reflector. There is no way to trim it, or to perform a partial And. You're out of luck with this class. Replace it with a custom-written class that supports what you need. Writing a bit array is not especially hard.
Completely revised based on this comment:
The resulting bitarray of your example would be 01010. My original problem states that I need to see if any of the bits are the same. Thus a resulting bitarray with any 1's would be True, and one with all 0's would be False.
/// <summary>
/// Copies the first <paramref name="size"/> bits of <paramref name="source"/> into a
/// new BitArray of length <paramref name="size"/>.
/// </summary>
/// <param name="source">The array to copy from; it must hold at least <paramref name="size"/> bits.</param>
/// <param name="size">The number of bits to copy.</param>
/// <returns>A new BitArray; <paramref name="source"/> is left unchanged.</returns>
BitArray truncateCopyBA(BitArray source, int size)
{
    BitArray dest = new BitArray(size);
    for (int i = 0; i < size; ++i)
    {
        dest[i] = source[i];
    }
    return dest;
}
/// <summary>
/// Reports whether <paramref name="a"/> and <paramref name="b"/> have a set bit at the
/// same position, looking only at the positions that exist in both arrays.
/// </summary>
/// <param name="a">First bit array; not modified.</param>
/// <param name="b">Second bit array; it may have a different length. Not modified.</param>
/// <returns>true when some index below both lengths is set in both arrays.</returns>
bool YourFunc(BitArray a, BitArray b)
{
    // Only indexes present in both arrays can be set in both. Comparing in place avoids
    // BitArray.And, which throws when the lengths differ and overwrites its receiver.
    int commonLength = a.Length < b.Length ? a.Length : b.Length;

    for (int i = 0; i < commonLength; ++i)
    {
        if (a[i] && b[i])
        {
            return true;
        }
    }

    return false;
}
I believe this is what you're looking for, but honestly your question is still quite vague after clarifications.

Search for an Array or List in a List

Have
List<byte> lbyte
Have
byte[] searchBytes
How can I search lbyte for not just a single byte but for the index of the searchBytes?
E.G.
Int32 index = lbyte.FirstIndexOf(searchBytes);
Here is the brute force I came up with.
Not the performance I am looking for.
/// <summary>
/// Finds the first position at which the bytes of <paramref name="sbs"/> occur
/// contiguously in <paramref name="lb"/>.
/// </summary>
/// <param name="lb">The list to search.</param>
/// <param name="sbs">The byte sequence to look for; any length is accepted.</param>
/// <returns>
/// The zero-based index of the first match, or -1 when either argument is null,
/// <paramref name="sbs"/> is empty, or the sequence does not occur.
/// </returns>
public static Int32 ListIndexOfArray(List<byte> lb, byte[] sbs)
{
    if (lb == null || sbs == null) return -1;
    if (sbs.Length == 0) return -1;

    // Last index at which the whole sequence still fits inside the list.
    Int32 lastStart = lb.Count - sbs.Length;

    for (int start = 0; start <= lastStart; start++)
    {
        // Every candidate start is compared from scratch, so a partial match that fails
        // cannot hide a real match beginning inside it (e.g. {1,1,2} within {1,1,1,2}).
        Int32 matched = 0;
        while (matched < sbs.Length && lb[start + matched] == sbs[matched])
        {
            matched++;
        }

        if (matched == sbs.Length)
        {
            return start;
        }
    }

    return -1;
}
Brute force is always an option. Although slow in comparison to some other methods, in practice it's usually not too bad. It's easy to implement and quite acceptable if lbyte isn't huge and doesn't have pathological data.
It's the same concept as brute force string searching.
You may find Boyer-Moore algorithm useful here. Convert your list to an array and search. The algorithm code is taken from this post.
/// <summary>
/// Simplified Boyer-Moore search (single skip table) for a byte sequence.
/// </summary>
/// <param name="haystack">The bytes to search.</param>
/// <param name="needle">The byte sequence to look for.</param>
/// <returns>
/// The zero-based index of the first occurrence, 0 for an empty needle, or -1 when the
/// needle does not occur.
/// </returns>
static int SimpleBoyerMooreSearch(byte[] haystack, byte[] needle)
{
    // An empty needle has no last byte to anchor on; report a match at offset 0.
    if (needle.Length == 0)
    {
        return 0;
    }

    // lookup[b] = distance from the last occurrence of b in the needle to the needle's
    // end; bytes that do not occur allow a jump of the whole needle length.
    int[] lookup = new int[256];
    for (int i = 0; i < lookup.Length; i++) { lookup[i] = needle.Length; }
    for (int i = 0; i < needle.Length; i++)
    {
        lookup[needle[i]] = needle.Length - i - 1;
    }

    int index = needle.Length - 1; // haystack position aligned with the needle's last byte
    byte lastByte = needle[needle.Length - 1];

    while (index < haystack.Length)
    {
        byte checkByte = haystack[index];
        if (checkByte == lastByte)
        {
            // Last byte matches: verify the rest of the needle from right to left.
            bool found = true;
            for (int j = needle.Length - 2; j >= 0; j--)
            {
                if (haystack[index - needle.Length + j + 1] != needle[j])
                {
                    found = false;
                    break;
                }
            }

            if (found)
                return index - needle.Length + 1;
            else
                index++;
        }
        else
        {
            // checkByte differs from lastByte here, so its table entry is at least 1.
            index += lookup[checkByte];
        }
    }

    return -1;
}
You can then search like this. If lbyte will remain constant after a certain time, you can just convert it to an array once and pass that.
// Snapshot the list as an array, then search it. The result is the offset of the
// first occurrence of 'searchBytes', or -1 if 'searchBytes' is not found.
byte[] haystackBytes = lbyte.ToArray();
int startIndex = SimpleBoyerMooreSearch(haystackBytes, searchBytes);
Update based on comment. Here's the IList implementation, which means that arrays and lists (and anything else that implements IList) can be passed.
/// <summary>
/// Simplified Boyer-Moore search (single skip table) over any indexable byte
/// collection, so arrays and lists can be searched without conversion.
/// </summary>
/// <param name="haystack">The bytes to search.</param>
/// <param name="needle">The byte sequence to look for.</param>
/// <returns>
/// The zero-based index of the first occurrence, 0 for an empty needle, or -1 when the
/// needle does not occur.
/// </returns>
static int SimpleBoyerMooreSearch(IList<byte> haystack, IList<byte> needle)
{
    int needleCount = needle.Count;

    // An empty needle has no last byte; without this guard needle[-1] would be read.
    if (needleCount == 0)
    {
        return 0;
    }

    // lookup[b] = distance from the last occurrence of b in the needle to the needle's
    // end; bytes that do not occur allow a jump of the whole needle length.
    int[] lookup = new int[256];
    for (int i = 0; i < lookup.Length; i++) { lookup[i] = needleCount; }
    for (int i = 0; i < needleCount; i++)
    {
        lookup[needle[i]] = needleCount - i - 1;
    }

    int index = needleCount - 1; // haystack position aligned with the needle's last byte
    byte lastByte = needle[index];

    while (index < haystack.Count)
    {
        byte checkByte = haystack[index];
        if (checkByte != lastByte)
        {
            // A non-matching byte always has a table entry of at least 1.
            index += lookup[checkByte];
            continue;
        }

        // Last byte matches: verify the rest of the needle from right to left.
        bool found = true;
        for (int j = needleCount - 2; j >= 0; j--)
        {
            if (haystack[index - needleCount + j + 1] != needle[j])
            {
                found = false;
                break;
            }
        }

        if (found)
        {
            return index - needleCount + 1;
        }

        index++;
    }

    return -1;
}
Since arrays and lists implement IList, there's no conversion necessary when calling it in your case.
// The list and the array are passed as they are; no ToArray() copy is made.
int startIndex = SimpleBoyerMooreSearch(haystack: lbyte, needle: searchBytes);
Another way you could do with lambda expression
// Index of the first position at which searchBytes occurs contiguously in lbyte, or -1
// when it does not occur. Each candidate start is tested by comparing the slice that
// begins there with searchBytes (the previous predicate compared a single list element
// with every search byte, which cannot locate a multi-byte sequence).
int index = Enumerable
    .Range(0, System.Math.Max(0, lbyte.Count - searchBytes.Length + 1))
    .Where(start => lbyte.Skip(start).Take(searchBytes.Length).SequenceEqual(searchBytes))
    .DefaultIfEmpty(-1)
    .First();

Search longest pattern in byte array in C#

I need to write effective and quick method to search byte array for given pattern.
I wrote it this way; what do you think, and how could it be improved? It also has one bug: it cannot return a match for a pattern of length 1.
/// <summary>
/// Checks byte by byte whether <paramref name="pattern"/> occurs contiguously in
/// <paramref name="bytes"/>.
/// </summary>
/// <param name="bytes">The bytes to search.</param>
/// <param name="pattern">The byte sequence to look for.</param>
/// <returns>
/// true when the pattern is found (an empty pattern is always found); otherwise false.
/// </returns>
public static bool SearchByteByByte(byte[] bytes, byte[] pattern)
{
    if (pattern.Length == 0)
    {
        return true;
    }

    // Last start index at which the whole pattern still fits.
    int lastStart = bytes.Length - pattern.Length;

    for (int i = 0; i <= lastStart; i++)
    {
        if (bytes[i] != pattern[0])
        {
            continue;
        }

        // The first byte already matched. j ends at pattern.Length when every remaining
        // byte matches too, which is immediately the case for a one-byte pattern.
        int j = 1;
        while (j < pattern.Length && bytes[i + j] == pattern[j])
        {
            j++;
        }

        if (j == pattern.Length)
        {
            return true;
        }
    }

    return false;
}
Any suggestions ?
The Boyer-Moore algorithm that is used in grep is pretty efficient, and gets more efficient for longer pattern sizes. I'm pretty sure you could make it work for a byte array without too much difficulty, and its wikipedia page has an implementation in Java that should be fairly easy to port to C#.
UPDATE:
Here's an implementation of a simplified version of the Boyer-Moore algorithm for byte arrays in C#. It only uses the second jump table of the full algorithm. Based on the array sizes that you said (haystack: 2000000 bytes, needle: 10 bytes), it's about 5-8 times faster than a simple byte by byte algorithm.
/// <summary>
/// Boyer-Moore-Horspool search for a byte sequence: a single skip table, indexed by the
/// haystack byte aligned with the needle's last position, decides how far to advance.
/// </summary>
/// <param name="haystack">The bytes to search.</param>
/// <param name="needle">The byte sequence to look for.</param>
/// <returns>
/// The zero-based index of the first occurrence, 0 for an empty needle, or -1 when the
/// needle does not occur.
/// </returns>
static int SimpleBoyerMooreSearch(byte[] haystack, byte[] needle)
{
    // An empty needle has no last byte to anchor on; report a match at offset 0.
    if (needle.Length == 0)
    {
        return 0;
    }

    int last = needle.Length - 1;

    // shift[b] = distance from the last occurrence of b among the first (length - 1)
    // needle bytes to the needle's end. The final byte is left out on purpose: every
    // entry is then at least 1, so the same table can be used after a failed
    // verification instead of stepping forward by a single byte.
    int[] shift = new int[256];
    for (int i = 0; i < shift.Length; i++) { shift[i] = needle.Length; }
    for (int i = 0; i < last; i++)
    {
        shift[needle[i]] = last - i;
    }

    int index = last; // haystack position aligned with the needle's last byte
    while (index < haystack.Length)
    {
        // Compare the current window with the needle from right to left.
        int j = last;
        while (j >= 0 && haystack[index - last + j] == needle[j])
        {
            j--;
        }

        if (j < 0)
        {
            return index - last;
        }

        index += shift[haystack[index]];
    }

    return -1;
}
And it has one bug, it cannot return match with length 1
To fix this, start inner loop from zero:
/// <summary>
/// Checks byte by byte whether <paramref name="pattern"/> occurs contiguously in
/// <paramref name="bytes"/>.
/// </summary>
/// <param name="bytes">The bytes to search.</param>
/// <param name="pattern">The byte sequence to look for; it must not be empty.</param>
/// <returns>true when the pattern is found; otherwise false.</returns>
public static bool SearchByteByByte(byte[] bytes, byte[] pattern)
{
    for (int start = 0; start < bytes.Length; start++)
    {
        // A start is a candidate when its byte equals the pattern's first byte and
        // enough bytes remain for the whole pattern.
        bool candidate = pattern[0] == bytes[start] && bytes.Length - start >= pattern.Length;
        if (!candidate)
        {
            continue;
        }

        // Counting starts at offset 0, so a one-byte pattern is confirmed here as well.
        int matched = 0;
        for (int offset = 0; offset < pattern.Length; offset++)
        {
            if (bytes[start + offset] != pattern[offset])
            {
                break;
            }

            matched++;
        }

        // Every pattern byte has to match; there is no "- 1" in this comparison.
        if (matched == pattern.Length)
        {
            return true;
        }
    }

    return false;
}
UPDATE: Here is your searching algorithm after flattening it and removing the local variables (they are not needed):
/// <summary>
/// Checks byte by byte whether <paramref name="pattern"/> occurs contiguously in
/// <paramref name="bytes"/>.
/// </summary>
/// <param name="bytes">The bytes to search.</param>
/// <param name="pattern">The byte sequence to look for.</param>
/// <returns>
/// true when the pattern is found (an empty pattern is always found); otherwise false.
/// </returns>
public static bool SearchByteByByte(byte[] bytes, byte[] pattern)
{
    // Without this guard pattern[0] would be read for an empty pattern.
    if (pattern.Length == 0)
        return true;

    // The loop bound already guarantees that the whole pattern fits at position i, so
    // no remaining-length test is needed inside the loop.
    for (int i = 0; i <= bytes.Length - pattern.Length; i++)
    {
        for (int j = 0; j < pattern.Length; j++)
        {
            if (bytes[i + j] != pattern[j])
                break;
            if (j == pattern.Length - 1)
                return true;
        }
    }
    return false;
}
So you're looking, effectively, for the longest common substring, so see the Wikipedia article on that: http://en.wikipedia.org/wiki/Longest_common_substring_problem
... or even a reference implementation: http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#C.23 -- you will, of course, have to substitute byte[] for string there, etc.

Find an array (byte[]) inside another array?

What is the simplest way to find a byte[] inside another byte[]? I have a feeling I could do it with LINQ, but I don't know how.
Note: I did a search with the [c#] tag and didn't find anything, which surprised me.
Here's a faster version of Ergwun's excellent answer:
/// <summary>
/// Finds the first occurrence of <paramref name="needle"/> in
/// <paramref name="haystack"/> with a plain nested-loop comparison.
/// </summary>
/// <param name="haystack">The bytes to search.</param>
/// <param name="needle">The byte sequence to look for.</param>
/// <returns>The zero-based index of the first match, or -1 when there is none.</returns>
static int SearchBytes(byte[] haystack, byte[] needle)
{
    int needleLength = needle.Length;
    // Last start index at which the needle still fits; negative when it never fits.
    int lastStart = haystack.Length - needleLength;

    int start = 0;
    while (start <= lastStart)
    {
        int matched = 0;
        while (matched < needleLength && needle[matched] == haystack[start + matched])
        {
            matched++;
        }

        if (matched == needleLength)
        {
            return start;
        }

        start++;
    }

    return -1;
}
In a brief test with an 11MB haystack and 9 byte needle, this was about three times faster.
The optimizations are:
No function call each time through the outer loop.
Needle length and search limit are cached.
Redundant length test at the beginning of match() is removed.
Of course for long byte arrays you'd want to use something like a Boyer-Moore search, but for many purposes a simple algorithm like this is good enough, and it has the virtue of being short and easy to understand and verify.
Here's a simple (naive?) way to do it:
/// <summary>
/// Returns the first start index at which match() reports that
/// <paramref name="needle"/> lines up with <paramref name="haystack"/>.
/// </summary>
/// <param name="haystack">The bytes to search.</param>
/// <param name="needle">The byte sequence to look for.</param>
/// <returns>The zero-based index of the first match, or -1 when there is none.</returns>
static int search(byte[] haystack, byte[] needle)
{
    // Start indexes beyond this one leave too few bytes for the needle.
    int lastStart = haystack.Length - needle.Length;

    int candidate = 0;
    while (candidate <= lastStart)
    {
        if (match(haystack, needle, candidate))
        {
            return candidate;
        }

        candidate++;
    }

    return -1;
}
/// <summary>
/// Tests whether <paramref name="needle"/> equals the slice of
/// <paramref name="haystack"/> that begins at <paramref name="start"/>.
/// </summary>
/// <param name="haystack">The bytes to compare against.</param>
/// <param name="needle">The expected byte sequence.</param>
/// <param name="start">The haystack index at which the comparison begins.</param>
/// <returns>
/// false when the needle would run past the end of the haystack or any byte differs;
/// otherwise true.
/// </returns>
static bool match(byte[] haystack, byte[] needle, int start)
{
    // Not enough bytes left for the whole needle.
    if (needle.Length + start > haystack.Length)
    {
        return false;
    }

    int offset = 0;
    while (offset < needle.Length)
    {
        if (needle[offset] != haystack[offset + start])
        {
            return false;
        }

        offset++;
    }

    return true;
}
Try this one with using lambda expressions:
/// <summary>
/// Reports whether <paramref name="pattern"/> occurs contiguously in
/// <paramref name="array"/>.
/// </summary>
/// <param name="array">The bytes to search.</param>
/// <param name="pattern">The byte sequence to look for.</param>
/// <returns>
/// true when the pattern is found (an empty pattern is always found); otherwise false.
/// </returns>
private bool CheckPatternInArray(byte[] array, byte[] pattern)
{
    if (pattern.Length == 0)
    {
        return true;
    }

    // Number of start positions at which the whole pattern still fits.
    int candidateCount = array.Length - pattern.Length + 1;
    if (candidateCount <= 0)
    {
        return false;
    }

    // Array.FindIndex passes the elements to the predicate in order, starting at index
    // 0, so 'start' always equals the index of the element being tested. Each start is
    // verified in full: the earlier running-counter approach reset on a mismatch without
    // re-testing the current byte and therefore missed matches such as {1,2} in {1,1,2}.
    int start = -1;
    int result = Array.FindIndex(array, 0, candidateCount, (byte first) =>
    {
        start++;
        if (first != pattern[0])
        {
            return false;
        }

        for (int k = 1; k < pattern.Length; k++)
        {
            if (array[start + k] != pattern[k])
            {
                return false;
            }
        }

        return true;
    });

    return result >= 0;
}
If you are after the fastest one, check solutions here.
you probably could have figured this yourself but sometimes I like to do the simple thing.
// Linear scan for a single byte value. When the loop ends, 'found' tells whether
// 'lookingFor' occurs in 'byteArray', and on success 'i' is the index of its first
// occurrence.
bool found = false;
int i = 0;
for (; i < byteArray.Length; i++)
{
    if (byteArray[i] == lookingFor)
    {
        found = true;
        // Stop at the hit. The previous condition "i < byteArray.Length || found" kept
        // looping once found was set and eventually indexed past the end of the array.
        break;
    }
}

How to find substring from string without using indexof method in C#?

I want to find the position of a substring in a string, if present, without using any string method, including IndexOf. I have tried many times but failed. Can anybody tell me how to do this in C#? We are allowed to use the .Length property.
Sorry.. thought this would be a fun exercise for me, so...
Spoiler
/// <summary>
/// Console demo: prints the position of "cde" inside "abcdefg", found without using any
/// string search method, and then waits for Enter.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        const string text = "abcdefg";
        const string sought = "cde";

        Console.WriteLine(IndexOf(text, sought));
        Console.ReadLine();
    }

    /// <summary>
    /// Finds the first position of <paramref name="substr"/> in <paramref name="str"/>
    /// by comparing characters one at a time.
    /// </summary>
    /// <returns>The zero-based index of the first match, or -1 when there is none.</returns>
    private static int IndexOf(string str, string substr)
    {
        // Last start index at which substr still fits; negative when it never fits.
        int lastStart = str.Length - substr.Length;

        for (int start = 0; start <= lastStart; ++start)
        {
            int matched = 0;
            while (matched < substr.Length && str[start + matched] == substr[matched])
            {
                ++matched;
            }

            if (matched == substr.Length)
            {
                return start;
            }
        }

        return -1;
    }
}
Assuming this is homework, my suggestion is to bear in mind that a string is an IEnumerable of chars. So you can loop through the characters in your string...
Since any homework that inspired the question is well past due, here's a stab at a reasonably performant answer.
Simply cycling through the larger string, and cycling through the substring comparing each character as one goes takes Θ((n-m+1) m) time where m is the length of the substring, and n the index where the smaller string is found, or if there is no match the length of the larger minus that of the smaller.
There are a few different algorithms that give better performance; they differ among themselves in which cases they work best. The Knuth-Morris-Pratt algorithm takes Θ(m) to set up and then Θ(n) time to find, because it first creates a table that tells it how far ahead it can jump on failing to find a match, and on balance this makes for a quicker search.
Consider that if we were looking for "ababcd" and we'd first found "abab…" (a possible match so far), then if the next character is c we still have a possible match. If it's a, we don't have a match, but should jump forward two characters and start looking for a match from there. If it's anything else, we should jump ahead five characters and continue looking from there. Preparing the table that tells us how far to jump makes things much faster from then on:
/// <summary>
/// Finds the first position of <paramref name="needle"/> in
/// <paramref name="haystack"/> using the Knuth-Morris-Pratt algorithm.
/// </summary>
/// <param name="haystack">The string to search.</param>
/// <param name="needle">The string to look for.</param>
/// <returns>
/// The zero-based index of the first match, 0 for an empty needle, or -1 when the
/// needle does not occur.
/// </returns>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public static int IndexOf(string haystack, string needle)
{
    if (haystack == null || needle == null)
    {
        throw new ArgumentNullException();
    }

    int needleLength = needle.Length;
    int haystackLength = haystack.Length;

    // An empty needle is found at the very start.
    if (needleLength == 0)
    {
        return 0;
    }

    // A single character is found fastest with a plain scan.
    if (needleLength == 1)
    {
        char only = needle[0];
        for (int pos = 0; pos < haystackLength; pos++)
        {
            if (haystack[pos] == only)
            {
                return pos;
            }
        }

        return -1;
    }

    // Equal lengths: either the strings are identical or there is no match.
    if (needleLength == haystackLength)
    {
        return needle == haystack ? 0 : -1;
    }

    if (needleLength > haystackLength)
    {
        return -1;
    }

    int[] fallback = KMPTable(needle);
    int windowStart = 0; // where the current candidate match begins in haystack
    int matched = 0;     // how many needle characters of that candidate have matched

    while (windowStart + matched < haystackLength)
    {
        if (needle[matched] != haystack[windowStart + matched])
        {
            // Slide the window so that the longest reusable prefix stays aligned.
            int resume = fallback[matched];
            windowStart += matched - resume;
            matched = resume > -1 ? resume : 0;
            continue;
        }

        if (matched == needleLength - 1)
        {
            // The loop condition keeps windowStart below haystackLength here.
            return windowStart;
        }

        matched++;
    }

    return -1;
}

/// <summary>
/// Builds the KMP partial-match table for <paramref name="sought"/>, which must be at
/// least two characters long because entries 0 and 1 are always written.
/// </summary>
private static int[] KMPTable(string sought)
{
    int[] table = new int[sought.Length];
    table[0] = -1;
    table[1] = 0;

    int candidate = 0; // length of the prefix currently being extended
    int pos = 2;       // table entry being computed
    while (pos < table.Length)
    {
        if (sought[pos - 1] == sought[candidate])
        {
            candidate++;
            table[pos] = candidate;
            pos++;
        }
        else if (candidate > 0)
        {
            candidate = table[candidate];
        }
        else
        {
            table[pos] = 0;
            pos++;
        }
    }

    return table;
}
Try this:
/// <summary>
/// Reports whether <paramref name="searchKey"/> occurs contiguously in
/// <paramref name="str"/>, without using any string search method.
/// </summary>
/// <param name="str">The string to search.</param>
/// <param name="searchKey">The text to look for.</param>
/// <returns>
/// true when the key is found (an empty key is always found); otherwise false.
/// </returns>
internal bool SearchWord(string str, string searchKey)
{
    if (searchKey.Length == 0)
    {
        return true;
    }

    for (int start = 0; start + searchKey.Length <= str.Length; start++)
    {
        // Each start position is compared from scratch. Resetting a running counter on
        // a mismatch skipped the current character and missed keys such as "aab" in
        // "aaab".
        int j = 0;
        while (j < searchKey.Length && str[start + j] == searchKey[j])
        {
            j++;
        }

        if (j == searchKey.Length)
        {
            return true;
        }
    }

    return false;
}
Try this:
/// <summary>
/// Returns the text that lies between the first occurrence of
/// <paramref name="StrFirst"/> and the next occurrence of <paramref name="StrLast"/>
/// after it.
/// </summary>
/// <param name="ActualStr">The string to extract from.</param>
/// <param name="StrFirst">The opening marker.</param>
/// <param name="StrLast">The closing marker; it is searched for only after the opening marker.</param>
/// <returns>
/// The text between the two markers, excluding the markers themselves, or an empty
/// string when either marker is missing.
/// </returns>
public static string BetweenOf(string ActualStr, string StrFirst, string StrLast)
{
    int firstIndex = ActualStr.IndexOf(StrFirst, System.StringComparison.Ordinal);
    if (firstIndex < 0)
    {
        return string.Empty;
    }

    // The content begins right after the opening marker.
    int contentStart = firstIndex + StrFirst.Length;

    int lastIndex = ActualStr.IndexOf(StrLast, contentStart, System.StringComparison.Ordinal);
    if (lastIndex < 0)
    {
        return string.Empty;
    }

    // The length is the distance between the two markers. The earlier computation added
    // the closing marker's length to an offset measured from the opening marker, which
    // returned text beyond the closing marker.
    return ActualStr.Substring(contentStart, lastIndex - contentStart);
}
// Reads a text and a search string from the console and prints every index at which
// the search string occurs in the text.
string mainString = Console.ReadLine();
string subString = Console.ReadLine();
for (int i = 0; i <= mainString.Length - subString.Length; i++)
{
    bool match = true;
    // Every character has to be compared. With the mismatch test inside the loop
    // condition the loop ended at the first matching character, so any position whose
    // first character matched was reported.
    for (int j = 0; j < subString.Length; j++)
    {
        if (mainString[i + j] != subString[j])
        {
            match = false;
            break;
        }
    }
    if (match)
        Console.WriteLine(i);
}
public static findindex(String str,String substr)
{
char a[]=str.toCharArray();
char b[]=substr.toCharArray();
int j=0,t=0;
for(int i=0;i<str.length()&&j<substr.length();i++)
{
if(a[i]==b[j])
{
t=i;
j++;
}
else
continue;
}
if(t==0)
return -1;
else
return t-substr.length()+1;
}//in java

Categories