Parsing data from nested brackets in C# - c#

Finally solved this myself after making many styles of parsers suffering from the same bug...
Working (cleaner but still needs more cleaning) code Here
if(input > 47 & input < 123) should be
if(input > 44 & input < 123) to include the '.'
thanks for no help.
Modifying This example code almost works for me, and is a starting point to learn more.
I've been trying to parse some files from the game Europa Universalis IV.
The files contain text like this:
Example 1
Example 2
In case someone wants to help debug Here is a browser version with 2 processed OuterEntities as example.
Once everything works the data would be put into a database after every object is finished. For testing I just write my progress to a text file.
I have tried this a couple times in different ways that almost always end up looking the same. By now the process is split into parts, and most of them work.
I start by collecting all the "OuterEntities" (First level brackets) in each of the GameFiles (just one for fast testing), this step is easy.
I put those outer entities into a list in a GameFile object (ToString overrides are just for testing) :
GameFile
/// <summary>
/// One game file together with the first-level ("outer") bracket blocks found in it.
/// </summary>
internal record GameFile
{
    /// <summary>Raw text of outer blocks that have not been parsed yet.</summary>
    public List<string> UnprocessedOuterEntites = new();

    /// <summary>Display name of the file; written as the first line of <see cref="ToString"/>.</summary>
    public string Name;

    /// <summary>Path the file was read from.</summary>
    public string Path;

    /// <summary>Outer blocks that have already been parsed.</summary>
    public List<OuterEntity> OuterEntities = new();

    /// <summary>
    /// Debug dump: name, path, every parsed outer entity and finally every
    /// still-unprocessed outer block, one per line.
    /// </summary>
    public override string ToString()
    {
        StringBuilder sb = new();
        sb.AppendLine(Name);
        sb.AppendLine(Path);
        foreach (OuterEntity entity in OuterEntities)
        {
            sb.AppendLine(entity.ToString());
        }
        foreach (string entity in UnprocessedOuterEntites)
        {
            sb.AppendLine(entity);
        }
        return sb.ToString();
    }
} // this closing brace was missing in the original listing
OuterEntity
/// <summary>
/// A first-level bracket block: its name plus the inner entities found inside it,
/// both parsed and still raw.
/// </summary>
internal class OuterEntity
{
    public List<string> UnprocessedInnerEntities = new();
    public string? Name = "";
    public List<InnerEntity> InnerEntities = new();

    /// <summary>Empties both lists in place and resets the name to null.</summary>
    public void Clear()
    {
        Name = null;
        InnerEntities.Clear();
        UnprocessedInnerEntities.Clear();
    }

    /// <summary>
    /// Debug dump: the name followed directly (no separators) by every parsed
    /// inner entity and then every unprocessed inner block.
    /// </summary>
    public override string ToString()
    {
        // A null name contributes nothing, exactly like Append(null).
        var text = new StringBuilder(Name);
        for (int i = 0; i < InnerEntities.Count; i++)
        {
            text.Append(InnerEntities[i].ToString());
        }
        for (int i = 0; i < UnprocessedInnerEntities.Count; i++)
        {
            text.Append(UnprocessedInnerEntities[i]);
        }
        return text.ToString();
    }
}
Those OuterEntities contain InnerEntities, which contain values or more InnerEntities (Nested brackets) :
Inner Entity
/// <summary>
/// A nested entry: a name with either a plain value or further nested entities.
/// </summary>
internal class InnerEntity
{
    /// <summary>Raw text of nested blocks that have not been parsed yet.</summary>
    public List<string> UnprocessedInnerEntities = new();
    public string? Name;
    public string? Value;
    public List<InnerEntity> InnerEntities = new();

    /// <summary>Resets name and value to null and replaces both lists with fresh empty ones.</summary>
    public void Clear()
    {
        this.Name = null;
        this.Value = null;
        this.InnerEntities = new();
        this.UnprocessedInnerEntities = new();
    }

    /// <summary>
    /// Debug dump: the name on its own line, then either the nested entities
    /// (when there are any) or the value, then every unprocessed nested block.
    /// </summary>
    public override string ToString()
    {
        StringBuilder sb = new StringBuilder();
        sb.AppendLine(Name);
        if (InnerEntities.Count > 0)
        {
            foreach (InnerEntity entity in InnerEntities)
            {
                sb.Append(entity.ToString());
            }
        }
        else
        {
            sb.AppendLine(Value);
        }
        foreach (string unprocessed in UnprocessedInnerEntities)
        {
            sb.AppendLine(unprocessed);
        }
        return sb.ToString();
    }
} // this closing brace was missing in the original listing
The program is (for testing) run like this:
/// <summary>
/// Test driver: collects the outer entities of every *.txt file below "Source",
/// processes them and prints the result.
/// </summary>
internal class Program
{
    private static Program program = new();
    public List<GameFile> gameFiles = new();

    public static void Main() => program.Run();

    /// <summary>Runs the collect / process / print pipeline once.</summary>
    private void Run()
    {
        string[] sourceFiles = Directory.GetFiles("Source", "*.txt", SearchOption.AllDirectories);
        for (int i = 0; i < sourceFiles.Length; i++)
        {
            Process.GetOuterEntities(sourceFiles[i], this);
        }

        Process.ProcessOuterEntities(this);
        Process.ProcessInnerEntities(this);
        Test.PrintOuterEntities(this);
    }
}
The OuterEntities are collected and processed:
/// <summary>
/// Reads the file at <paramref name="path"/>, strips comments and collects the text of
/// every first-level bracket block into a new <see cref="GameFile"/>.
/// </summary>
/// <param name="path">File to read.</param>
/// <param name="program">Receives the file in <c>gameFiles</c> when at least one block was found.</param>
public static void GetOuterEntities(string path, Program program)
{
    string[] lines = System.IO.File.ReadAllLines(path);
    GameFile file = new GameFile() { Path = path };
    string st = RemoveComments(lines);

    int brackets = 0;          // current nesting depth
    int lastPos = 0;           // index where the current outer block starts
    bool inBrackets = false;   // true once the current block has opened a bracket

    for (int currentPos = 0; currentPos < st.Length; currentPos++)
    {
        char c = st[currentPos];
        if (c == '{')
        {
            inBrackets = true;
            brackets++;
        }
        else if (c == '}')
        {
            brackets--;
        }

        if (brackets == 0 && inBrackets) // end of an outer block
        {
            inBrackets = false;
            // Everything from the previous block's end up to and including this
            // closing bracket, with line feeds removed.
            string block = st.Substring(lastPos, currentPos - lastPos + 1).Replace("\n", "");
            file.UnprocessedOuterEntites.Add(block);
            lastPos = currentPos + 1;
        }
    }

    // Register the file once. The original added it inside the loop, which put the
    // same GameFile into the list once per outer block.
    if (file.UnprocessedOuterEntites.Count > 0)
    {
        program.gameFiles.Add(file);
    }
} // this closing brace was missing in the original listing
/// <summary>
/// Converts every unprocessed outer block of every collected game file into an
/// <see cref="OuterEntity"/> and then empties the file's unprocessed list.
/// </summary>
public static void ProcessOuterEntities(Program program)
{
    foreach (GameFile gameFile in program.gameFiles)
    {
        List<string> pending = gameFile.UnprocessedOuterEntites;
        for (int i = 0; i < pending.Count; i++)
        {
            OuterEntity parsed = ProcessOuterEntity(pending[i]);
            gameFile.OuterEntities.Add(parsed);
        }
        pending.Clear();
    }
}
/// <summary>
/// Splits one outer block into its name and its raw inner blocks.
/// The name is everything up to and including the first '{' (line feeds removed);
/// each time the bracket depth returns to zero, the text since the previous cut
/// is stored as one unprocessed inner entity.
/// </summary>
private static OuterEntity ProcessOuterEntity(string UnprocessedOuterEntity)
{
    var outer = new OuterEntity();
    int sliceStart = 0;   // first index not yet consumed
    int depth = 0;        // bracket nesting depth
    bool opened = false;  // a bracket has been opened since the last cut

    for (int pos = 0; pos < UnprocessedOuterEntity.Length; pos++)
    {
        switch (UnprocessedOuterEntity[pos])
        {
            case '{':
                if (outer.Name == "")
                {
                    // The slice is inclusive of the bracket itself.
                    outer.Name = UnprocessedOuterEntity
                        .Substring(sliceStart, pos - sliceStart + 1)
                        .Replace("\n", "");
                    sliceStart = pos + 1;
                }
                opened = true;
                depth++;
                break;
            case '}':
                depth--;
                break;
        }

        if (depth != 0 || !opened)
        {
            continue;
        }

        opened = false;
        outer.UnprocessedInnerEntities.Add(
            UnprocessedOuterEntity.Substring(sliceStart, pos - sliceStart + 1));
        sliceStart = pos + 1;
    }

    return outer;
}
After this I fail to process the inner entities.
This is the currently not working part:
/// <summary>
/// Scans <paramref name="UnprocessedInnerEntity"/> one character at a time, tracking bracket
/// depth, and fills a new <see cref="InnerEntity"/>: at every '=' the text since the last cut
/// becomes the name, and non-text characters seen after an '=' trigger an attempt to read the value.
/// Scanning stops as soon as the first bracket group closes.
/// </summary>
/// <param name="UnprocessedInnerEntity">Raw text to scan.</param>
/// <param name="WaitingForValue">Initial state of the "an '=' has been seen" flag; the recursive call below always passes true.</param>
/// <returns>The entity built from the scan.</returns>
// NOTE(review): the question presents this as the part that does not work yet. The defects
// visible in the code are marked inline below.
private static InnerEntity ProcessInnerEntity(string UnprocessedInnerEntity, bool WaitingForValue = false)
{
InnerEntity processedInnerEntity = new();
// currentPos is the index of the character held in c; lastPos is the start of the next slice.
int currentPos = 0;
int lastPos = 0;
int brackets = 0;
bool inBrackets = false;
bool waitingForValue = WaitingForValue;
StringBuilder sb = new StringBuilder();
foreach (char c in UnprocessedInnerEntity)
{
if (c == '{')//Brackets opening
{
inBrackets = true;
brackets++;
}
else if (c == '}')//Brackets closing
{
brackets--;
}
else if (c == '=')//Name is the text before '='; a value or nested entity follows it
{
// NOTE(review): the slice runs up to and INCLUDING currentPos, so the '=' itself (and any
// whitespace before it) ends up in Name. Every further '=' overwrites Name again.
for (int i = lastPos; i <= currentPos; i++)
{
sb.Append(UnprocessedInnerEntity[i]);
}
processedInnerEntity.Name = sb.ToString();
sb.Clear();
lastPos = currentPos + 1;
waitingForValue = true;
if (inBrackets)//In a nested bracket, find InnerEntity
{
// NOTE(review): the recursion parses the rest of the text, but this loop then keeps
// scanning the same characters, so nested '=' signs are handled again at this level.
processedInnerEntity.InnerEntities.Add(ProcessInnerEntity(UnprocessedInnerEntity.Substring(currentPos + 1), waitingForValue));//Parse everything after the '='
}
}
else if (c<47 | c > 123)//Not text: character code below 47 or above 123
{
if (waitingForValue)//After =
{
for (int i = lastPos; i <= currentPos; i++)
{
// NOTE(review): this tests c, not UnprocessedInnerEntity[i]. Inside this branch c is
// always < 47 or > 123, so the condition is never true: nothing is appended,
// waitingForValue is never cleared here and Value is never assigned below.
// The bounds also exclude '-' (45) and '.' (46), which values such as -0.2 need.
if (c > 47 & c < 123)//Text
{
waitingForValue = false;
sb.Append(UnprocessedInnerEntity[i]);
}
}
if (!waitingForValue)//Has value
{
processedInnerEntity.Value = sb.ToString();
sb.Clear();
}
}
}
if (brackets == 0 && inBrackets)//Brackets closed: the first bracket group is complete
{
inBrackets = false;
currentPos++;
break;
}
currentPos++;
}
return processedInnerEntity;
}
The rest of the code can be found here
The debug file is written like this
/// <summary>Debug helper that dumps the current parsing state to a text file.</summary>
internal class Test
{
    /// <summary>
    /// Writes the <c>ToString()</c> dump of every collected game file to "test.txt".
    /// </summary>
    /// <remarks>
    /// The original called <c>File.WriteAllText</c> once per game file; each call
    /// replaced the previous content, so only the last file survived. All dumps
    /// are now written in a single call.
    /// </remarks>
    public static void PrintOuterEntities(Program program)
    {
        StringBuilder dump = new();
        foreach (GameFile gameFile in program.gameFiles)
        {
            dump.Append(gameFile.ToString());
        }
        File.WriteAllText("test.txt", dump.ToString());
    }
}
Current output looks like this:
source\common\ideas\00_country_ideas.txt
HLR_ideas = { 1 diplomatic_reputation =
2 } bonus =
2 } bonus =
GER_ideas = { 0.3 infantry_power =
0.15 } bonus =
0.15 } bonus =
0.15 } bonus =
0.15 } bonus =
I want a list of OuterEntities containing their InnerEntities like this:
OuterEntity //Object
Name HLR_Ideas
InnerEntities //List
Name start
InnerEntities
Name possible_policy
Value 1
Name diplomatic_reputation
Value 2
Name bonus
InnerEntities
Name administrative_efficiency
Value 0.05
Name trigger
InnerEntities
Name tag
Value HLR
Name free
Value yes
Name hlr_imperial_throne
InnerEntities
Name legitimacy
Value 1.5
Name hlr_kaiserliche_armee
InnerEntities
Name land_morale
Value 0.15
Name hlr_imperial_diplomacy
InnerEntities
Name improve_relation_modifier
Value 0.33
Name hlr_centralised_empire
InnerEntities
Name global_tax_modifier
Value 0.2
Name hlr_roman_heritage
InnerEntities
Name core_creation
Value -0.2
Name hlr_adopting_the_goosestep
InnerEntities
Name discipline
Value 0.05
Name hlr_onwards_and_upwards
InnerEntities
Name governing_capacity_modifier
Value 0.1
Is there something obvious I'm doing wrong in the last part? Are there existing wheels I don't know about I'm trying to reinvent? Do I just learn Perl or the fancy non regular regular expressions I see out there?

Related

how to scramble words in sentence using C# and put in into array

I need help fixing code from a lesson I followed. I am trying to write a simple script that scrambles the words in a sentence, so that "the house is broken" becomes, for example, "broken the is house". My code runs, but it only scrambles the letters of a single word, so "THE" becomes "H.T.E".
I tried to use the string.Split method, but I don't understand where I must change the code so that it works on the array of words.
Here is my code and the result:
using UnityEngine;
using System.Collections;
using System.Collections.Generic;
using UnityEngine.UI;
/// <summary>
/// One puzzle entry: the solution text plus an optional hand-written scramble.
/// </summary>
[System.Serializable]
public class Word
{
    /// <summary>The solution the player has to restore. Never modified by this class.</summary>
    public string word;

    [Header("biarkan kosong untuk acak otomatis")]
    public string desiredRandom;

    /// <summary>
    /// Returns the scrambled text to show: <see cref="desiredRandom"/> when it is set,
    /// otherwise a random shuffle of <see cref="word"/>. A sentence (several words
    /// separated by spaces) is shuffled word by word, a single word letter by letter.
    /// </summary>
    /// <remarks>
    /// Fixes over the original: the <c>word</c> field is no longer overwritten with the last
    /// token of the split (which also broke the later comparison against the solution), and
    /// the shuffle no longer uses <c>Random.Range(0, Count - 1)</c>, whose exclusive upper
    /// bound meant the last element could never move.
    /// </remarks>
    public string GetString()
    {
        if (!string.IsNullOrEmpty(desiredRandom))
        {
            return desiredRandom;
        }

        if (string.IsNullOrEmpty(word))
        {
            return "";
        }

        string[] tokens = word.Split(' ');
        if (tokens.Length > 1)
        {
            Shuffle(tokens);
            return string.Join(" ", tokens);
        }

        char[] characters = word.ToCharArray();
        Shuffle(characters);
        return new string(characters);
    }

    /// <summary>Fisher-Yates shuffle, in place.</summary>
    private static void Shuffle<T>(T[] items)
    {
        for (int i = items.Length - 1; i > 0; i--)
        {
            // The int overload of Random.Range excludes the upper bound, hence i + 1.
            int j = Random.Range(0, i + 1);
            T swap = items[i];
            items[i] = items[j];
            items[j] = swap;
        }
    }
}
/// <summary>
/// Shows the scrambled characters of the current <see cref="Word"/> as swappable
/// <c>CharObject</c>s and advances to the next word once the player restores it.
/// </summary>
public class WordScramble : MonoBehaviour
{
    public Word[] words;
    [Header("UI Reference")]
    public CharObject prefab;
    public Transform container;
    public float space;
    public float lerpSpeed = 5;
    List<CharObject> charObjects = new List<CharObject>();
    CharObject firstSelected;
    public int currentWord;
    public static WordScramble main;

    void Awake()
    {
        main = this;
    }

    // Use this for initialization
    void Start()
    {
        ShowScramble(currentWord);
    }

    // Update is called once per frame
    void Update()
    {
        RepositionObject();
    }

    /// <summary>Moves every character towards its slot, laid out around the container's centre.</summary>
    void RepositionObject()
    {
        if (charObjects.Count == 0)
        {
            return;
        }
        // 2f forces floating-point division; the original integer division shifted
        // the layout by half a slot whenever the character count was even.
        float center = (charObjects.Count - 1) / 2f;
        for (int i = 0; i < charObjects.Count; i++)
        {
            charObjects[i].rectTransform.anchoredPosition
                = Vector2.Lerp(charObjects[i].rectTransform.anchoredPosition,
                    new Vector2((i - center) * space, 0), lerpSpeed * Time.deltaTime);
            charObjects[i].index = i;
        }
    }

    /// <summary>Shows a randomly chosen word.</summary>
    public void ShowScramble()
    {
        // The int overload of Random.Range excludes the upper bound, so pass Length
        // (not Length - 1); otherwise the last word could never be selected.
        ShowScramble(Random.Range(0, words.Length));
    }

    /// <summary>
    /// Clears the board and shows the scramble of <c>words[index]</c>.
    /// An out-of-range index is logged as an error and leaves the board empty.
    /// </summary>
    public void ShowScramble(int index)
    {
        charObjects.Clear();
        foreach (Transform child in container)
        {
            Destroy(child.gameObject);
        }
        // Negative indices are rejected as well; the original only checked the upper bound.
        if (index < 0 || index > words.Length - 1)
        {
            Debug.LogError("index out of range between 0-" + (words.Length - 1).ToString());
            return;
        }
        char[] chars = words[index].GetString().ToCharArray();
        foreach (char c in chars)
        {
            CharObject clone = Instantiate(prefab.gameObject).GetComponent<CharObject>();
            clone.transform.SetParent(container);
            charObjects.Add(clone.Init(c));
        }
        currentWord = index;
    }

    /// <summary>Swaps two characters on the board and schedules a solution check.</summary>
    public void Swap(int indexA, int indexB)
    {
        CharObject tmpA = charObjects[indexA];
        charObjects[indexA] = charObjects[indexB];
        charObjects[indexB] = tmpA;
        charObjects[indexA].transform.SetAsLastSibling();
        charObjects[indexB].transform.SetAsLastSibling();
        CheckWord();
    }

    /// <summary>
    /// First call remembers the character; a later call swaps it with the newly
    /// selected one and calls Select() on both.
    /// </summary>
    public void Select(CharObject charObject)
    {
        if (firstSelected)
        {
            Swap(firstSelected.index, charObject.index);
            // firstSelected is not reset here; UnSelect() does that.
            firstSelected.Select();
            charObject.Select();
        }
        else
        {
            firstSelected = charObject;
        }
    }

    public void UnSelect()
    {
        firstSelected = null;
    }

    public void CheckWord()
    {
        StartCoroutine(CoCheckWord());
    }

    /// <summary>After half a second, compares the board with the solution and advances on a match.</summary>
    IEnumerator CoCheckWord()
    {
        yield return new WaitForSeconds(0.5f);
        string word = "";
        foreach (CharObject charObject in charObjects)
        {
            word += charObject.character;
        }
        if (word == words[currentWord].word)
        {
            currentWord++;
            ShowScramble(currentWord);
        }
    }
}
this for a result
Maybe I can get some help with this problem. I'm still learning C#, so I'm sorry if my code is messy.
Your code is assigning the splits in a loop to the word variable over and over, overwriting word at each time. Then you are splitting word into its chars and try to shuffle them. You should instead shuffle the array of splits.
Example using LINQ:
/// <summary>
/// Splits <paramref name="sentence"/> on single spaces and returns the pieces in random order.
/// </summary>
public static string[] ScrambleSentence(string sentence)
{
    var random = new Random();
    string[] tokens = sentence.Split(' ');
    // Sorting by a fresh random key per token yields a random permutation.
    var shuffled = from token in tokens
                   orderby random.Next()
                   select token;
    return shuffled.ToArray();
}
Here is a simple way to randomize word positions in a text
var rd = new Random();
// Sort the space-separated words by a fresh random key each, i.e. shuffle them.
string[] words = (from word in text.Split(' ')
                  orderby rd.Next()
                  select word).ToArray();
// If you want a simple string instead of an array of words
string rdText = string.Join(" ", words);

Foreach won't return elements of char list

So I'm making this hangman game, as I'm trying to learn C#, but now I'm stuck: the output shows System.Collections.Generic.List`1[System.Char] instead of the letters. What I'm trying to do is save wrong answers into the list nepravilne; look at the functions izpis and igra.
/// <summary>Console hangman: guess a randomly chosen word one letter at a time.</summary>
class Program
{
    private static int _sccore;

    // One shared generator instead of a new Random per round.
    private static readonly Random _rnd = new Random();

    static void Main(string[] args)
    {
        bool play = true;
        do
        {
            string beseda = izberi_besedo();
            igra(beseda, beseda.Length);
            Console.WriteLine("Vnesite Y za nadaljevanje ali N za zakljucitev igre.");

            // Read the whole line instead of char.Parse, which threw on empty or
            // multi-character input. The prompt asks for Y/N, so ignore case.
            string odgovor = Console.ReadLine();
            if (odgovor == null)
            {
                break; // input stream closed
            }
            odgovor = odgovor.Trim();
            if (odgovor.Equals("y", StringComparison.OrdinalIgnoreCase))
            {
                play = true;
                Console.WriteLine("play {0}", play);
            }
            else if (odgovor.Equals("n", StringComparison.OrdinalIgnoreCase))
            {
                play = false;
                Console.WriteLine("play {0}", play);
            }
            // Any other answer leaves play unchanged, as before.
        } while (play);
    }

    /// <summary>Picks a random word from the built-in list.</summary>
    private static string izberi_besedo()
    {
        string[] besede = { "voda", "ladija", "letalo", "motor", "klavir", "harmonika", "saksofon", "oklep", "penkalo", "tiskalnik", "miza", "copat", "krogla", "klobuk", "gumb", "harfa", "kontrabas", "mandarina", "les", "knjiga", "vlak", "vijak", "struna", "kozarec" };
        // Random.Next excludes the upper bound; the original Next(0, 23) could never
        // return the last word.
        return besede[_rnd.Next(0, besede.Length)];
    }

    /// <summary>
    /// Plays one round: ends when every letter is revealed or after four wrong guesses.
    /// </summary>
    /// <param name="beseda">The word to guess.</param>
    /// <param name="dolzina">Number of letters in <paramref name="beseda"/>.</param>
    private static void igra(string beseda, int dolzina)
    {
        int i, poizkusi = 0;
        int pravilne = 0;
        bool endloop = false;
        char crka;
        List<char> nepravilne = new List<char>(); // wrong guesses so far
        string[] odkrite = new string[dolzina];
        for (i = 0; i < dolzina; i++) { odkrite[i] = "_"; }
        do
        {
            izpis(odkrite, nepravilne);
            vpis(out crka);
            if (beseda.IndexOf(crka) < 0) // the word does not contain the letter
            {
                poizkusi++;
                _sccore--;
                nepravilne.Add(crka);
            }
            for (i = 0; i < dolzina; i++)
            {
                // Count a position only the first time it is revealed; the original
                // counted it again on every repeat guess, so guessing one correct
                // letter repeatedly could "win" the round.
                if (crka.Equals(beseda[i]) && odkrite[i] == "_")
                {
                    odkrite[i] = Convert.ToString(crka);
                    pravilne++;
                    _sccore++;
                }
            }
            Console.Clear();
            if (pravilne >= dolzina || poizkusi >= 4) endloop = true;
        } while (endloop == false);
    }

    /// <summary>
    /// Prompts until the user enters exactly one character that is not a number.
    /// </summary>
    /// <exception cref="InvalidOperationException">The console input was closed.</exception>
    private static void vpis(out char crka)
    {
        Console.WriteLine("\nVnesite crko za ugibanje besede");
        string vnos = Console.ReadLine();
        while (vnos == null || vnos.Length != 1 || IsNumeric(vnos))
        {
            if (vnos == null)
            {
                throw new InvalidOperationException("Console input was closed.");
            }
            if (vnos.Length != 1) Console.WriteLine("Vnesli ste prevec crk, poizkusite ponovno");
            if (IsNumeric(vnos)) Console.WriteLine("Vnesli ste stevilko, poizkusite ponovno");
            vnos = Console.ReadLine();
        }
        crka = vnos[0];
    }

    /// <summary>True when the text parses as an integer.</summary>
    private static bool IsNumeric(string vpis)
    {
        int number;
        return int.TryParse(vpis, out number);
    }

    /// <summary>Prints the score, the wrong guesses and the partly revealed word.</summary>
    private static void izpis(string[] odkrite, List<char> nepravilne)
    {
        Console.Write("Rezultat {0} | ", _sccore);
        foreach (char element in nepravilne)
        {
            // Print the element; the original printed the list itself, which shows
            // its type name (System.Collections.Generic.List`1[System.Char]).
            Console.Write("{0} ", element);
        }
        Console.WriteLine();
        foreach (string element in odkrite)
        {
            Console.Write("{0} ", element);
        }
    }
}
}
I think you have a typo in the following code, i.e I think you are intending to print the variable element in the loop and not nepravilne:
foreach (char element in nepravilne)
{
Console.Write("{0} ", nepravilne);
}
Should be as follows instead?
foreach (char element in nepravilne)
{
Console.Write("{0} ", element);
}

Importing and removing duplicates from a massive amount of text files using C# and Redis

This is a bit of a doozy and it's been a while since I worked with C#, so bear with me:
I'm running a jruby script to iterate through 900 files (5 Mb - 1500 Mb in size) to figure out how many dupes STILL exist within these (already uniq'd) files. I had little luck with awk.
My latest idea was to insert them into a local MongoDB instance like so:
db.collection('hashes').update({ :_id => hash}, { $inc: { count: 1} }, { upsert: true)
... so that later I could just query it like db.collection.where({ count: { $gt: 1 } }) to get all the dupes.
This is working great except it's been over 24 hours and at the time of writing I'm at 72,532,927 Mongo entries and growing.
I think Ruby's .each_line is bottlenecking the IO hardcore:
So what I'm thinking now is compiling a C# program which fires up a thread PER EACH FILE and inserts the line (md5 hash) into a Redis list.
From there, I could have another compiled C# program simply pop the values off and ignore the save if the count is 1.
So the questions are:
Will using a compiled file reader and multithreading the file reads significantly improve performance?
Is using Redis even necessary? With a tremendous amount of AWS memory, could I not just use the threads to fill some sort of a list atomically and proceed from there?
Thanks in advance.
Updated
New solution. Old solution. The main idea is to calculate a dummy hash (just the sum of all chars in the string) for each line and store it in Dictionary<ulong, List<LinePosition>> _hash2LinePositions. Several different lines in the same stream can produce the same hash; this is handled by keeping a List as the dictionary value. When the hashes are the same, we read the lines back from the streams and compare them. LinePosition stores the info about a line - its position in the stream and its length. I don't have files as huge as yours, but my tests show that it works. Here is the full code:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
/// <summary>
/// Demo: finds duplicate lines inside and across several text streams. Each line gets a
/// cheap hash (the sum of its bytes); only lines sharing a hash are read back and compared.
/// </summary>
public class Solution
{
    /// <summary>Where a line lives inside its stream.</summary>
    struct LinePosition
    {
        public long Start;
        public long Length;

        public LinePosition(long start, long count)
        {
            Start = start;
            Length = count;
        }

        public override string ToString()
        {
            return string.Format("Start: {0}, Length: {1}", Start, Length);
        }
    }

    /// <summary>Indexes the lines of one seekable stream by their byte-sum hash.</summary>
    class TextFileHasher : IDisposable
    {
        readonly Dictionary<ulong, List<LinePosition>> _hash2LinePositions;
        readonly Stream _stream;
        bool _isDisposed;

        /// <summary>All line hashes seen in this stream.</summary>
        public HashSet<ulong> Hashes { get; private set; }
        public string Name { get; private set; }

        public TextFileHasher(string name, Stream stream)
        {
            Name = name;
            _stream = stream;
            _hash2LinePositions = new Dictionary<ulong, List<LinePosition>>();
            Hashes = new HashSet<ulong>();
        }

        public override string ToString()
        {
            return Name;
        }

        /// <summary>
        /// Reads the stream byte by byte and records hash and position of every
        /// non-empty line ('\r' and '\n' both end a line).
        /// </summary>
        public void CalculateFileHash()
        {
            int readByte;
            ulong lineHash = 0;
            long startPosition = 0;
            // Explicit flag instead of "hash != 0", so a line whose bytes sum to
            // zero is still recognised as a line.
            bool inLine = false;

            while ((readByte = _stream.ReadByte()) != -1)
            {
                if (readByte == '\r' || readByte == '\n')
                {
                    if (inLine)
                    {
                        // Position is already past the line break, hence the - 1.
                        AddToDictAndHash(lineHash, startPosition, _stream.Position - 1 - startPosition);
                        lineHash = 0;
                        inLine = false;
                    }
                }
                else
                {
                    if (!inLine)
                    {
                        startPosition = _stream.Position - 1;
                        inLine = true;
                    }
                    lineHash += (uint)readByte;
                }
            }

            // Last line without a trailing line break.
            if (inLine)
            {
                AddToDictAndHash(lineHash, startPosition, _stream.Position - startPosition);
            }
        }

        /// <summary>Positions of all lines with the given hash.</summary>
        /// <exception cref="KeyNotFoundException">No line in this stream has that hash.</exception>
        public List<LinePosition> GetLinePositions(ulong hash)
        {
            return _hash2LinePositions[hash];
        }

        /// <summary>
        /// Returns the duplicated lines inside this stream: one entry for every
        /// occurrence of a line after its first.
        /// </summary>
        public List<string> GetDuplicates()
        {
            List<string> duplicates = new List<string>();
            foreach (List<LinePosition> linesPos in _hash2LinePositions.Values)
            {
                if (linesPos.Count > 1)
                {
                    duplicates.AddRange(FindExactDuplicates(linesPos));
                }
            }
            return duplicates;
        }

        public void Dispose()
        {
            if (_isDisposed)
                return;
            _stream.Dispose();
            _isDisposed = true;
        }

        private void AddToDictAndHash(ulong hash, long start, long count)
        {
            List<LinePosition> linesPosition;
            if (!_hash2LinePositions.TryGetValue(hash, out linesPosition))
            {
                linesPosition = new List<LinePosition>();
                _hash2LinePositions.Add(hash, linesPosition);
            }
            linesPosition.Add(new LinePosition(start, count));
            Hashes.Add(hash);
        }

        /// <summary>Reads the bytes of one line back from the stream.</summary>
        /// <exception cref="EndOfStreamException">The stream ends before the line is complete.</exception>
        public byte[] GetLineAsByteArray(LinePosition prevPos)
        {
            byte[] lineBytes = new byte[prevPos.Length];
            _stream.Seek(prevPos.Start, SeekOrigin.Begin);

            // Stream.Read may return fewer bytes than requested; keep reading until
            // the whole line has arrived.
            int total = 0;
            while (total < lineBytes.Length)
            {
                int read = _stream.Read(lineBytes, total, lineBytes.Length - total);
                if (read == 0)
                {
                    throw new EndOfStreamException("Stream ended inside a recorded line.");
                }
                total += read;
            }
            return lineBytes;
        }

        /// <summary>
        /// Among lines that share a hash, finds those whose bytes are identical.
        /// </summary>
        /// <remarks>
        /// The original compared every line only against the first line of its length
        /// group, so for "abc", "cba", "cba" the repeated "cba" was missed. Every line is
        /// now checked against all distinct lines seen so far.
        /// </remarks>
        private List<string> FindExactDuplicates(List<LinePosition> linesPos)
        {
            List<string> duplicates = new List<string>();
            // Base64 gives a byte-exact key, independent of any text encoding.
            HashSet<string> seen = new HashSet<string>();
            foreach (LinePosition pos in linesPos)
            {
                byte[] lineBytes = GetLineAsByteArray(pos);
                if (!seen.Add(Convert.ToBase64String(lineBytes)))
                {
                    duplicates.Add(System.Text.Encoding.Default.GetString(lineBytes));
                }
            }
            return duplicates;
        }
    }

    /// <summary>Builds a hasher over an in-memory copy of <paramref name="text"/> and indexes it.</summary>
    private static TextFileHasher CreateHasher(string name, string text)
    {
        TextFileHasher hasher = new TextFileHasher(name, new MemoryStream(System.Text.Encoding.Default.GetBytes(text)));
        hasher.CalculateFileHash();
        return hasher;
    }

    public static void Main(String[] args)
    {
        List<TextFileHasher> textFileHashers = new List<TextFileHasher>();
        try
        {
            textFileHashers.Add(CreateHasher("Text1", "abc\r\ncba\r\nabc"));
            textFileHashers.Add(CreateHasher("Text2", "def\r\ncba\r\nwet"));
            textFileHashers.Add(CreateHasher("Text3", "def\r\nbla\r\nwat"));

            List<string> totalDuplicates = new List<string>();

            // Duplicates inside each file. The original only did this for the file that
            // happened to be the first owner of some hash, skipping files whose hashes
            // all occur in earlier files.
            foreach (TextFileHasher tfh in textFileHashers)
            {
                totalDuplicates.AddRange(tfh.GetDuplicates());
            }

            // Which files contain which hash.
            Dictionary<ulong, List<TextFileHasher>> filesByHash = new Dictionary<ulong, List<TextFileHasher>>();
            foreach (TextFileHasher tfh in textFileHashers)
            {
                foreach (ulong hash in tfh.Hashes)
                {
                    List<TextFileHasher> owners;
                    if (!filesByHash.TryGetValue(hash, out owners))
                    {
                        owners = new List<TextFileHasher>();
                        filesByHash.Add(hash, owners);
                    }
                    owners.Add(tfh);
                }
            }

            // Duplicates across files: compare every pair of files that share a hash.
            foreach (KeyValuePair<ulong, List<TextFileHasher>> entry in filesByHash)
            {
                List<TextFileHasher> tfhs = entry.Value;
                if (tfhs.Count <= 1)
                {
                    continue;
                }
                for (int i = 0; i < tfhs.Count; i++)
                {
                    List<LinePosition> firstPositions = tfhs[i].GetLinePositions(entry.Key);
                    for (int j = i + 1; j < tfhs.Count; j++)
                    {
                        List<LinePosition> secondPositions = tfhs[j].GetLinePositions(entry.Key);
                        foreach (LinePosition firstPos in firstPositions)
                        {
                            byte[] firstBytes = tfhs[i].GetLineAsByteArray(firstPos);
                            foreach (LinePosition secondPos in secondPositions)
                            {
                                if (firstPos.Length != secondPos.Length)
                                    continue;
                                byte[] secondBytes = tfhs[j].GetLineAsByteArray(secondPos);
                                if (firstBytes.SequenceEqual(secondBytes))
                                {
                                    totalDuplicates.Add(System.Text.Encoding.Default.GetString(firstBytes));
                                }
                            }
                        }
                    }
                }
            }

            Console.WriteLine();
            if (totalDuplicates.Count > 0)
            {
                Console.WriteLine("Total number of duplicates: {0}", totalDuplicates.Count);
                Console.WriteLine("#######################");
                totalDuplicates.ForEach(x => Console.WriteLine("{0}", x));
                Console.WriteLine("#######################");
            }
        }
        finally
        {
            // Free resources even when hashing or comparing throws.
            foreach (TextFileHasher tfh in textFileHashers)
                tfh.Dispose();
        }
    }
}
If you have tons of ram... You guys are overthinking it...
// Verbatim string literals start with '@'; the '#' in the original does not compile.
var fileLines = File.ReadAllLines(@"c:\file.csv").Distinct();

Split a range sequence into multiple string c#,linq [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 7 years ago.
Improve this question
Not sure why question is being marked as offtopic, where as so called desired behaviour is included within the question post!
I am trying to write this program that takes two inputs:
• a set of include intervals
• and a set of exclude intervals
The sets of intervals can be given in any order, and they may be empty or overlapping. The program should output the result of taking all the includes and “remove” the excludes. The output should be given as non-overlapping intervals in a sorted order.
Intervals will contain Integers only
Example :
Includes: 50-600, 10-100
Excludes: (empty)
Output: 10-600
Includes: 10-100, 200-300, 400-600
Excludes: 95-205, 410-420
Output: 10-94, 206-300, 400-409, 421-600
I tried to populate two Enumerable Range from include and excludes (after splitting,parsing ), but didn't find any efficient way of implementing this afterwards.
// Attempt from the question: expand every include range and every exclude range into
// lists of integers and subtract them pairwise.
// NOTE(review): problems visible in this fragment:
//  - res has _break.Length + 1 slots, but count grows once per (include, exclude) pair,
//    so res[count] runs past the end once there are more pairs than slots.
//  - each exclude is subtracted from the full include range `a` again; the result of one
//    subtraction is never carried over to the next exclude.
//  - Min()-Max() collapses any hole cut into the middle of a range, and both throw when
//    `c` is empty (include fully covered by the exclude).
//  - _back is declared but not used in the lines shown.
string[] _break = _string.Split(',');
string[] _breakB = _stringB.Split(',');
string[] res = new string[_break.Length + 1];
string[] _items, _itemsB;
List < int > _back = new List < int > ();
int count = 0;
foreach(var _item in _break) {
_items = _item.Split('-');
// every integer of the include range, both ends inclusive
var a = Enumerable.Range(int.Parse(_items[0]), (int.Parse(_items[1]) - int.Parse(_items[0]) + 1)).ToList();
foreach(var _itemB in _breakB) {
_itemsB = _itemB.Split('-');
// every integer of the exclude range, both ends inclusive
var b = Enumerable.Range(int.Parse((_itemsB[0])), (int.Parse(_itemsB[1]) - int.Parse((_itemsB[0])) + 1)).ToList();
var c = a.Except < int > (b).ToList();
/// different things tried here, but they are not good
res[count] = c.Min().ToString() + "-" + c.Max().ToString();
count++;
}
}
return res;
Any input will be of great help
You can use the Built-in SortedSet<T> collection to do most of the work for you like this:
The SortedSet<T> collection implements the useful UnionWith and ExceptWith methods which at least makes the code quite easy to follow:
/// <summary>
/// Reads the include and exclude range lists from the two text boxes, removes every
/// excluded value from the included ones and shows the remaining values as sorted,
/// non-overlapping "from-to" ranges separated by commas.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    SortedSet<int> remaining = ParseRanges(_string.Text);
    remaining.ExceptWith(ParseRanges(_stringB.Text));
    result.Text = FormatRanges(remaining);
}

/// <summary>
/// Parses a comma separated list such as "10-100, 200-300" into the set of all integers it covers.
/// Blank entries are skipped, so an empty text box yields an empty set instead of the
/// FormatException that int.Parse("") raised in the original.
/// </summary>
/// <exception cref="FormatException">An entry is not of the form "from-to".</exception>
private static SortedSet<int> ParseRanges(string text)
{
    SortedSet<int> values = new SortedSet<int>();
    string[] items = text.Replace(" ", "").Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
    foreach (string item in items)
    {
        string[] bounds = item.Split('-');
        if (bounds.Length != 2)
        {
            throw new FormatException("Expected a range of the form 'from-to' but got '" + item + "'.");
        }
        int first = int.Parse(bounds[0]);
        int last = int.Parse(bounds[1]);
        values.UnionWith(Enumerable.Range(first, last - first + 1));
    }
    return values;
}

/// <summary>
/// Collapses runs of consecutive values into "from-to" pieces joined by commas.
/// An empty set gives an empty string (the original printed its -1 sentinel).
/// </summary>
private static string FormatRanges(SortedSet<int> values)
{
    List<string> pieces = new List<string>();
    bool hasRun = false;
    int runStart = 0;
    int runEnd = 0;
    foreach (int value in values)
    {
        if (hasRun && value == runEnd + 1)
        {
            runEnd = value; // still inside the current run
            continue;
        }
        if (hasRun)
        {
            pieces.Add(runStart + "-" + runEnd);
        }
        runStart = value;
        runEnd = value;
        hasRun = true;
    }
    if (hasRun)
    {
        pieces.Add(runStart + "-" + runEnd);
    }
    return string.Join(",", pieces);
}
This should work faster than SortedSet trick, at least for large intervals. Idea is like:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace Test
{
    using Pair = Tuple<int, int>; //for brevity

    /// <summary>One border point of an interval.</summary>
    struct Point
    {
        public enum Border { Left, Right };
        public enum Interval { Including, Excluding };

        /// <summary>The border value (inclusive).</summary>
        public int Val;
        /// <summary>+1 for a left border, -1 for a right border.</summary>
        public int Brdr;
        /// <summary>0 for an including interval, 1 for an excluding one.</summary>
        public int Intr;

        public Point(int value, Border border, Interval interval)
        {
            Val = value;
            Brdr = (border == Border.Left) ? 1 : -1;
            Intr = (int)interval;
        }

        public override string ToString() =>
            (Brdr == 1 ? "L" : "R") + (Intr == 0 ? "+ " : "- ") + Val;
    }

    class Program
    {
        /// <summary>
        /// Returns the integers covered by the include intervals but by no exclude interval,
        /// as sorted, non-overlapping closed intervals. Include intervals that overlap or
        /// directly follow each other (e.g. 10-20 and 21-30) are merged.
        /// </summary>
        /// <param name="strIn">Include intervals, e.g. "10-100, 200-300".</param>
        /// <param name="strEx">Exclude intervals in the same format; may be empty.</param>
        /// <remarks>
        /// Two fixes over the original sweep. (1) The points are combined with Concat
        /// instead of Union: Union removes equal points, so two intervals sharing a border
        /// (e.g. "10-100, 10-50") unbalanced the nesting counters. (2) Right borders are
        /// moved to the half-open position Val + 1 and all events at one position are
        /// applied before the state is evaluated, so borders that coincide are handled
        /// correctly (include 10-100 minus exclude 100-200 now gives 10-99).
        /// </remarks>
        static IEnumerable<Pair> GetInterval(string strIn, string strEx)
        {
            //a func to get interval border points from string:
            Func<string, Point.Interval, IEnumerable<Point>> parse = (str, intr) =>
                Regex.Matches(str, "[0-9]+").Cast<Match>().Select((s, idx) =>
                    new Point(int.Parse(s.Value), (Point.Border)(idx % 2), intr));

            // Each point becomes an event on a half-open axis: a left border takes effect
            // at Val, a right border at Val + 1 (long avoids overflow at int.MaxValue).
            var events = parse(strIn, Point.Interval.Including)
                .Concat(parse(strEx, Point.Interval.Excluding))
                .Select(p => new { Position = p.Brdr == 1 ? (long)p.Val : p.Val + 1L, p.Intr, p.Brdr })
                .OrderBy(ev => ev.Position)
                .ToList();

            var depth = new int[2]; //open interval count: IN[0], EX[1]
            bool active = false;    //inside a resulting interval?
            long start = 0;         //left border of the current resulting interval

            int i = 0;
            while (i < events.Count)
            {
                long position = events[i].Position;
                // Apply every event at this position before looking at the state.
                while (i < events.Count && events[i].Position == position)
                {
                    depth[events[i].Intr] += events[i].Brdr;
                    i++;
                }

                bool nowActive = depth[0] > 0 && depth[1] <= 0;
                if (nowActive && !active)
                {
                    start = position;
                }
                else if (!nowActive && active)
                {
                    yield return new Pair((int)start, (int)(position - 1));
                }
                active = nowActive;
            }
        }

        static void Main(string[] args)
        {
            var strIN = "10-100, 200-300, 400-500, 420-480";
            var strEX = "95-205, 410-420";
            foreach (var i in GetInterval(strIN, strEX))
                Console.WriteLine(i.Item1 + "-" + i.Item2);
            Console.ReadLine();
        }
    }
}
So, your task can be split into a list of subtasks:
Parse a source line of intervals into a list of objects
Concatenate (merge) intervals if they overlap each other
Exclude the 'excludes' intervals from the 'includes'
I published my result code here: http://rextester.com/OBXQ56769
The code could be optimized as well, but I wanted it to be quite simple. Hope it will help you.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace ConsoleApplication
{
public class Program
{
private const string Includes = "10-100, 200-300, 400-500 ";
private const string Excludes = "95-205, 410-420";
// Verbatim string literals start with '@'; the '#' in the original does not compile.
private const string Pattern = @"(\d*)-(\d*)";
/// <summary>
/// Parses the sample include and exclude lists, merges each list, subtracts the
/// excludes from the includes and prints every remaining interval as "min-max".
/// </summary>
public static void Main(string[] args)
{
    var parsedIncludes = ParseIntevals(Includes);
    var parsedExcludes = ParseIntevals(Excludes);
    var mergedIncludes = ConcatinateIntervals(parsedIncludes);
    var mergedExcludes = ConcatinateIntervals(parsedExcludes);

    // The Result
    var remaining = ExcludeFromInclude(mergedIncludes, mergedExcludes);
    for (int i = 0; i < remaining.Count; i++)
    {
        Console.WriteLine(remaining[i].Min + "-" + remaining[i].Max);
    }
}
/// <summary>
/// Excludes intervals 'excludes' from 'includes'
/// </summary>
public static List<Interval> ExcludeFromInclude(List<Interval> includes, List<Interval> excludes)
{
var result = new List<Interval>();
if (!excludes.Any())
{
return includes.Select(x => x.Clone()).ToList();
}
for (int i = 0; i < includes.Count; i++)
{
for (int j = 0; j < excludes.Count; j++)
{
if (includes[i].Max < excludes[j].Min || includes[i].Min > excludes[j].Max)
continue; // no crossing
//1 Example: includes[i]=(10-20) excludes[j]=(15-25)
if (includes[i].Min < excludes[j].Min && includes[i].Max <= excludes[j].Max)
{
var interval = new Interval(includes[i].Min, excludes[j].Min - 1);
result.Add(interval);
break;
}
//2 Example: includes[i]=(10-25) excludes[j]=(15-20)
if (includes[i].Min <= excludes[j].Min && includes[i].Max >= excludes[j].Max)
{
if (includes[i].Min < excludes[j].Min)
{
var interval1 = new Interval(includes[i].Min, excludes[j].Min - 1);
result.Add(interval1);
}
if (includes[i].Max > excludes[j].Max)
{
var interval2 = new Interval(excludes[j].Max + 1, includes[i].Max);
result.Add(interval2);
}
break;
}
//3 Example: includes[i]=(15-25) excludes[j]=(10-20)
if (includes[i].Min < excludes[j].Max && includes[i].Max > excludes[j].Max)
{
var interval = new Interval(excludes[j].Max + 1, includes[i].Max);
result.Add(interval);
break;
}
}
}
return result;
}
/// <summary>
/// Concatinates intervals if they cross each over
/// </summary>
public static List<Interval> ConcatinateIntervals(List<Interval> intervals)
{
var result = new List<Interval>();
for (int i = 0; i < intervals.Count; i++)
{
for (int j = 0; j < intervals.Count; j++)
{
if (i == j)
continue;
if (intervals[i].Max < intervals[j].Min || intervals[i].Min > intervals[j].Max)
{
Interval interval = intervals[i].Clone();
result.Add(interval);
continue; // no crossing
}
//1
if (intervals[i].Min < intervals[j].Min && intervals[i].Max < intervals[j].Max)
{
var interval = new Interval(intervals[i].Min, intervals[j].Max);
result.Add(interval);
break;
}
//2
if (intervals[i].Min < intervals[j].Min && intervals[i].Max > intervals[j].Max)
{
Interval interval = intervals[i].Clone();
result.Add(interval);
break;
}
//3
if (intervals[i].Min < intervals[j].Max && intervals[i].Max > intervals[j].Max)
{
var interval = new Interval(intervals[j].Min, intervals[i].Max);
result.Add(interval);
break;
}
//4
if (intervals[i].Min > intervals[j].Min && intervals[i].Max < intervals[j].Max)
{
var interval = new Interval(intervals[j].Min, intervals[j].Max);
result.Add(interval);
break;
}
}
}
return result.Distinct().ToList();
}
/// <summary>
/// Parses a source line of intervals to the list of objects
/// </summary>
public static List<Interval> ParseIntevals(string intervals)
{
var matches = Regex.Matches(intervals, Pattern, RegexOptions.IgnoreCase);
var list = new List<Interval>();
foreach (Match match in matches)
{
var min = int.Parse(match.Groups[1].Value);
var max = int.Parse(match.Groups[2].Value);
list.Add(new Interval(min, max));
}
return list.OrderBy(x => x.Min).ToList();
}
/// <summary>
/// Interval
/// </summary>
public class Interval
{
public int Min { get; set; }
public int Max { get; set; }
public Interval()
{
}
public Interval(int min, int max)
{
Min = min;
Max = max;
}
public override bool Equals(object obj)
{
var obj2 = obj as Interval;
if (obj2 == null) return false;
return obj2.Min == Min && obj2.Max == Max;
}
public override int GetHashCode()
{
return this.ToString().GetHashCode();
}
public override string ToString()
{
return string.Format("{0}-{1}", Min, Max);
}
public Interval Clone()
{
return (Interval) this.MemberwiseClone();
}
}
}
}
Lots of ways to solve this. The LINQ approach hasn't been discussed yet - this is how I would do it:
// Expands a comma/space separated list of "a-b" ranges into the individual integers
// they cover. Declared as a lambda because both the include and the exclude list use it.
Func<string, IEnumerable<int>> rangeFn =
    baseInput => baseInput
        .Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries)
        .SelectMany(rng =>
        {
            // Materialize once: the bounds are read twice below, and a deferred
            // Select would run Convert.ToInt32 again for each read.
            var bounds = rng.Split(new[] { '-' }, StringSplitOptions.RemoveEmptyEntries)
                            .Select(i => Convert.ToInt32(i))
                            .ToArray();
            // Min/Max rather than first/last, so a reversed range (e.g. 10-5) still works.
            var start = bounds.Min();
            var end = bounds.Max();
            return Enumerable.Range(start, end - start + 1);
        });
var includes = rangeFn(_string);
var excludes = rangeFn(_stringB);
// Except is a set difference and yields each remaining value once, so no Distinct is needed.
var result = includes.Except(excludes).OrderBy(r => r);

How to separate character and number part from string

E.g., I would like to separate:
OS234 to OS and 234
AA4230 to AA and 4230
I have used the following trivial solution, but I am quite sure that there should be a more efficient and robust one.
/// <summary>
/// Splits a sample cell reference into its leading text and trailing number,
/// using getIndexofNumber to find where the number starts.
/// </summary>
private void demo()
{
    string cell = "ABCD4321";
    int a = getIndexofNumber(cell);
    string Numberpart = cell.Substring(a, cell.Length - a);
    // Convert the substring extracted above; 'row' is declared here so the snippet compiles on its own.
    int row = Convert.ToInt32(Numberpart);
    string Stringpart = cell.Substring(0, a);
}
/// <summary>
/// Finds the position of the first ASCII digit ('0'-'9') in <paramref name="cell"/>.
/// </summary>
/// <param name="cell">Text to scan.</param>
/// <returns>
/// Zero-based index of the first digit, or 0 when the text contains no digit.
/// </returns>
private int getIndexofNumber(string cell)
{
    // A single scan with no artificial upper bound on the index.
    int indexofNum = cell.IndexOfAny("0123456789".ToCharArray());

    // 0 remains the "no digit found" result so existing callers see the same value.
    return indexofNum >= 0 ? indexofNum : 0;
}
Regular Expressions are best suited for this kind of work:
using System.Text.RegularExpressions;
// Group 1 captures a run of ASCII letters, group 2 the digits that directly follow it.
// The pattern is a verbatim string (@"...") so that \d reaches the regex engine intact.
Regex re = new Regex(@"([a-zA-Z]+)(\d+)");
Match result = re.Match(input);
// If nothing matched, both values are empty strings; check result.Success to tell the cases apart.
string alphaPart = result.Groups[1].Value;
string numberPart = result.Groups[2].Value;
Use Linq to do this
string str = "OS234";
// In a query expression the where clause has to come before the closing select.
// Both queries are deferred: they run when digits/alphas are enumerated.
var digits = from c in str
             where Char.IsDigit(c)
             select c;
var alphas = from c in str
             where !Char.IsDigit(c)
             select c;
Everyone and their mother will give you a solution using regex, so here's one that is not:
// Precondition: s has the form ([A-Za-z])*([0-9])* -- letters first, then digits.
// If s contains no digit, IndexOfAny returns -1 and the Substring call below throws.
char[] digitChars = "0123456789".ToCharArray();
int firstDigit = s.IndexOfAny(digitChars);
string chars = s.Substring(0, firstDigit);
int num = int.Parse(s.Substring(firstDigit));
I really like jason's answer. Let's improve it a bit. We don't need regex here. My solution handles input like "H1N1":
/// <summary>
/// Cuts <paramref name="input"/> wherever a letter is followed by a non-letter or
/// a non-letter is followed by a letter.
/// </summary>
/// <param name="input">Text to split.</param>
/// <returns>
/// The pieces in their original order; an empty input yields one empty string.
/// </returns>
public static IEnumerable<string> SplitAlpha(string input)
{
    var segments = new List<string>();
    int segmentStart = 0;

    for (int pos = 1; pos < input.Length; pos++)
    {
        // Same kind of character as the previous one: the current segment continues.
        if (char.IsLetter(input[pos - 1]) == char.IsLetter(input[pos]))
        {
            continue;
        }

        segments.Add(input.Substring(segmentStart, pos - segmentStart));
        segmentStart = pos;
    }

    // Whatever follows the last boundary (possibly the whole input) is the final segment.
    segments.Add(input.Substring(segmentStart));
    return segments;
}
This solution is linear O(n).
output
"H1N1" -> ["H", "1", "N", "1"]
"H" -> ["H"]
"GH1N12" -> ["GH", "1", "N", "12"]
"OS234" -> ["OS", "234"]
Same solution with a StringBuilder
/// <summary>
/// Cuts <paramref name="input"/> wherever a letter is followed by a non-letter or
/// a non-letter is followed by a letter, collecting each piece in a StringBuilder.
/// </summary>
/// <param name="input">Text to split.</param>
/// <returns>
/// A lazily evaluated sequence of the pieces in their original order; an empty
/// input yields one empty string.
/// </returns>
public static IEnumerable<string> SplitAlpha(string input)
{
    var current = new StringBuilder();
    var parts = new List<StringBuilder>();
    parts.Add(current);

    for (int pos = 0; pos < input.Length; pos++)
    {
        char ch = input[pos];
        current.Append(ch);

        bool hasNext = pos < input.Length - 1;
        if (hasNext && char.IsLetter(ch) != char.IsLetter(input[pos + 1]))
        {
            // The next character is of the other kind: start a new piece for it.
            current = new StringBuilder();
            parts.Add(current);
        }
    }

    return parts.Select(part => part.ToString());
}
Try it Online!
If you want to resolve multiple occurrences of chars followed by a number, or vice versa, you can use
/// <summary>
/// Inserts a single space at every position where a letter is directly followed by
/// a digit or a digit is directly followed by a letter.
/// </summary>
/// <param name="text">Text to process.</param>
/// <returns>
/// The text with the separating spaces added; an empty input is returned as an empty string.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
private string SplitCharsAndNums(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }
    if (text.Length == 0)
    {
        // Nothing to split; without this guard text[text.Length - 1] below would throw.
        return string.Empty;
    }

    var sb = new StringBuilder(text.Length);
    for (var i = 0; i < text.Length - 1; i++)
    {
        sb.Append(text[i]);

        bool letterThenDigit = char.IsLetter(text[i]) && char.IsDigit(text[i + 1]);
        bool digitThenLetter = char.IsDigit(text[i]) && char.IsLetter(text[i + 1]);
        if (letterThenDigit || digitThenLetter)
        {
            sb.Append(' ');
        }
    }

    // The loop stops one short because it looks ahead; the last character never needs a separator.
    sb.Append(text[text.Length - 1]);
    return sb.ToString();
}
And then
// Put a space at each letter/digit boundary, then cut the text at every space.
string text = SplitCharsAndNums("asd1 asas4gr5 6ssfd");
string[] tokens = text.Split(new char[] { ' ' });
Are you doing this for sorting purposes? If so, keep in mind that Regex can kill performance for large lists. I frequently use an AlphanumComparer that's a general solution to this problem (can handle any sequence of letters and numbers in any order). I believe that I adapted it from this page.
Even if you're not sorting on it, using the character-by-character approach (if you have variable lengths) or simple substring/parse (if they're fixed) will be a lot more efficient and easier to test than a Regex.
I have used bniwredyc's answer to get Improved version of my routine:
/// <summary>
/// Splits a sample cell reference at the index reported by getIndexofNumber:
/// the tail is converted to an integer, the head is kept as text.
/// </summary>
private void demo()
{
    const string cell = "ABCD4321";
    int splitAt = getIndexofNumber(cell);

    // Everything from the reported index to the end is treated as the number.
    string digits = cell.Substring(splitAt);
    int row = Convert.ToInt32(digits);

    // Everything before that index is the text part.
    string letters = cell.Substring(0, splitAt);
}
/// <summary>
/// Finds the position of the first character in <paramref name="cell"/> for which
/// Char.IsDigit is true.
/// </summary>
/// <param name="cell">Text to scan.</param>
/// <returns>
/// Zero-based index of the first digit, or -1 when there is none. The previous
/// version returned the last index in that case, which could not be told apart
/// from a digit in the last position.
/// </returns>
private int getIndexofNumber(string cell)
{
    for (int indexofNum = 0; indexofNum < cell.Length; indexofNum++)
    {
        if (Char.IsDigit(cell[indexofNum]))
        {
            return indexofNum;
        }
    }

    return -1;
}
.NET 2.0 compatible, without regex
/// <summary>
/// Read-only pair of a text part and an integer part; both are fixed at construction.
/// </summary>
public class Result
{
    private readonly string _text;
    private readonly int _number;

    /// <summary>Creates a result from the given parts.</summary>
    /// <param name="stringPart">Value exposed through <see cref="StringPart"/>.</param>
    /// <param name="intPart">Value exposed through <see cref="IntPart"/>.</param>
    public Result(string stringPart, int intPart)
    {
        _text = stringPart;
        _number = intPart;
    }

    /// <summary>The text part passed to the constructor.</summary>
    public string StringPart
    {
        get { return _text; }
    }

    /// <summary>The integer part passed to the constructor.</summary>
    public int IntPart
    {
        get { return _number; }
    }
}
/// <summary>
/// Sample that splits strings such as "OS234" into a text part and an integer part.
/// </summary>
class Program
{
    /// <summary>
    /// Splits <paramref name="source"/> at its first digit: everything before it becomes
    /// the text part, everything from it onwards is parsed as the integer part.
    /// </summary>
    /// <param name="source">Text to split.</param>
    /// <returns>
    /// The two parts, or null when the text from the first digit onwards (the whole text,
    /// if there is no digit) cannot be parsed as an integer.
    /// </returns>
    public static Result GetResult(string source)
    {
        string stringPart = String.Empty;
        // Explicit flag: testing "stringPart is still empty" would fire again on every
        // digit when the source starts with a digit, turning "123" into "1" and 23.
        bool digitSeen = false;
        var buffer = new StringBuilder();

        foreach (char c in source)
        {
            if (!digitSeen && Char.IsDigit(c))
            {
                digitSeen = true;
                stringPart = buffer.ToString();
                buffer.Length = 0; // restart the buffer for the numeric part
            }
            buffer.Append(c);
        }

        int intPart;
        if (!int.TryParse(buffer.ToString(), out intPart))
        {
            return null;
        }
        return new Result(stringPart, intPart);
    }

    // Splits one sample and prints both parts, or a notice when the split failed.
    private static void PrintResult(string source)
    {
        Result result = GetResult(source);
        if (result == null)
        {
            // GetResult signals failure with null; report it instead of dereferencing it.
            Console.WriteLine("Could not split: {0}", source);
            return;
        }
        Console.WriteLine("String part: {0} int part: {1}", result.StringPart, result.IntPart);
    }

    static void Main(string[] args)
    {
        PrintResult("OS234");
        PrintResult("AA4230 ");
        PrintResult("ABCD4321");
        Console.ReadKey();
    }
}
Just use the substring function and set position inside the bracket.
// substring(3, 6) returns the characters at indexes 3, 4 and 5: the "123" after the "DON" prefix.
String id = "DON123";
// Message text corrected ("nubmer" -> "number") so the output matches the answer shown below.
System.out.println("Id number is: " + id.substring(3, 6));
Answer:
Id number is: 123
Use Split to separate a string whose parts are delimited by a tab (\t) or a space:
// Cut the text at every tab character; each tab-delimited field becomes one array element.
string s = "sometext\tsometext\tsometext";
string[] split = s.Split(new char[] { '\t' });
Now you have the array of strings that you want. Too easy.

Categories