I want to split a Word document into separate pages and save each page to its own file: Page_1.docx, Page_2.docx, and so on.
At first I tried to do this:
using (var sourceWordDoc = WordprocessingDocument.Open(wordFilePath, false))
{
var sourceElements = sourceWordDoc.MainDocumentPart.Document.Body.Elements();
var pageElements = new List<DocumentFormat.OpenXml.OpenXmlElement>();
var pageIndex = 1;
foreach (var sourceElement in sourceElements)
{
var run = sourceElement.GetFirstChild<Run>();
if (run != null)
{
var lastRenderedPageBreak = run.GetFirstChild<LastRenderedPageBreak>();
var pageBreak = run.GetFirstChild<Break>();
if (lastRenderedPageBreak != null || pageBreak != null)
{
//Create and save page
using (var destinationWordDoc = WordprocessingDocument.Create(
$"Page_{pageIndex}.docx",
DocumentFormat.OpenXml.WordprocessingDocumentType.Document))
{
var destinationPart = destinationWordDoc.AddMainDocumentPart();
destinationPart.Document = new Document();
var destinationBody = destinationPart.Document.AppendChild(new Body());
foreach (var pageElement in pageElements)
{
destinationBody.Append(pageElement.CloneNode(true));
}
destinationPart.Document.Save();
}
pageElements.Clear();
pageIndex++;
}
}
pageElements.Add(sourceElement);
}
}
But in this variant, the header is not copied.
Then I wrote the following code to copy the header (as a test):
using (var sourceWordDoc = WordprocessingDocument.Open(wordFilePath, false))
{
var sourceHeaderPart = sourceWordDoc.MainDocumentPart.HeaderParts.FirstOrDefault();
var sourceDocument = sourceWordDoc.MainDocumentPart.Document;
using (var destinationWordDoc = WordprocessingDocument.Create(somePath, WordprocessingDocumentType.Document))
{
var destinationMainDocumentPart = destinationWordDoc.AddMainDocumentPart();
destinationMainDocumentPart.Document = new DocumentFormat.OpenXml.Wordprocessing.Document();
var destinationBody = destinationMainDocumentPart.Document.AppendChild(new DocumentFormat.OpenXml.Wordprocessing.Body());
var rId = string.Empty;
if (sourceHeaderPart != null)
{
var destinationHeaderPart = destinationMainDocumentPart.AddNewPart<HeaderPart>();
rId = destinationMainDocumentPart.GetIdOfPart(destinationHeaderPart);
destinationHeaderPart.FeedData(sourceHeaderPart.GetStream());
}
var sourceElements = sourceDocument.Body.Elements();
foreach (var sourceElement in sourceElements)
{
if (sourceElement.GetType() == typeof(DocumentFormat.OpenXml.Wordprocessing.SectionProperties))
{
var sourceSectionsElements = sourceElement.Elements();
foreach (var sourceSectionsElement in sourceSectionsElements)
{
if (sourceSectionsElement.GetType() == typeof(DocumentFormat.OpenXml.Wordprocessing.HeaderReference))
{
var sourceHeaderReference = (DocumentFormat.OpenXml.Wordprocessing.HeaderReference)sourceSectionsElement;
sourceHeaderReference.Id = rId;
break;
}
}
}
destinationBody.Append(sourceElement.CloneNode(true));
}
}
}
But now I have a new problem: images from the header are not transferred.
Are there any options for splitting the document into separate pages while preserving the entire content of each page?
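For reference, FeedData only copies the header's XML, not the parts it references, which is why the images are left behind. A minimal sketch of also copying the header's ImageParts, reusing the source relationship IDs so the r:embed references inside the header XML stay valid (untested, and assuming images are the only related parts):
if (sourceHeaderPart != null)
{
    var destinationHeaderPart = destinationMainDocumentPart.AddNewPart<HeaderPart>();
    rId = destinationMainDocumentPart.GetIdOfPart(destinationHeaderPart);
    destinationHeaderPart.FeedData(sourceHeaderPart.GetStream());
    // Copy each image referenced by the header, keeping the same relationship id
    // so the r:embed attributes in the copied header XML still resolve.
    foreach (var sourceImagePart in sourceHeaderPart.ImageParts)
    {
        var imageRelId = sourceHeaderPart.GetIdOfPart(sourceImagePart);
        var destinationImagePart = destinationHeaderPart.AddImagePart(sourceImagePart.ContentType, imageRelId);
        destinationImagePart.FeedData(sourceImagePart.GetStream());
    }
}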
I have a working solution for uploading CSV files. Currently, I use IFormCollection to let a user upload multiple CSV files from a view.
The CSV files are saved as temp files as follows:
List<string> fileLocations = new List<string>();
foreach (var formFile in files)
{
filePath = Path.GetTempFileName();
if (formFile.Length > 0)
{
using (var stream = new FileStream(filePath, FileMode.Create))
{
await formFile.CopyToAsync(stream);
}
}
fileLocations.Add(filePath);
}
I send the list of file locations to another method (just below). I loop through the file locations and stream the data from the temp files; I then use a DataTable and SqlBulkCopy to insert the data. I currently upload between 50 and 200 files at a time, and each file is around 330 KB. Inserting a hundred files (around 30-35 MB of data) takes around 6 minutes.
public void SplitCsvData(string fileLocation, Guid uid)
{
MetaDataModel MetaDatas;
List<RawDataModel> RawDatas;
var reader = new StreamReader(File.OpenRead(fileLocation));
List<string> listRows = new List<string>();
while (!reader.EndOfStream)
{
listRows.Add(reader.ReadLine());
}
var metaData = new List<string>();
var rawData = new List<string>();
foreach (var row in listRows)
{
var rowName = row.Split(',')[0];
bool parsed = int.TryParse(rowName, out int result);
if (parsed == false)
{
metaData.Add(row);
}
else
{
rawData.Add(row);
}
}
//Assigns the vertical header name and value to the object by splitting string
RawDatas = GetRawData.SplitRawData(rawData);
SaveRawData(RawDatas);
MetaDatas = GetMetaData.SplitRawData(rawData);
SaveRawData(RawDatas);
}
This code then passes the objects to the methods below to create the DataTable and insert the data.
private DataTable CreateRawDataTable
{
get
{
var dt = new DataTable();
dt.Columns.Add("Id", typeof(int));
dt.Columns.Add("SerialNumber", typeof(string));
dt.Columns.Add("ReadingNumber", typeof(int));
dt.Columns.Add("ReadingDate", typeof(string));
dt.Columns.Add("ReadingTime", typeof(string));
dt.Columns.Add("RunTime", typeof(string));
dt.Columns.Add("Temperature", typeof(double));
dt.Columns.Add("ProjectGuid", typeof(Guid));
dt.Columns.Add("CombineDateTime", typeof(string));
return dt;
}
}
public void SaveRawData(List<RawDataModel> data)
{
DataTable dt = CreateRawDataTable;
var count = data.Count;
for (var i = 1; i < count; i++)
{
DataRow row = dt.NewRow();
row["Id"] = data[i].Id;
row["ProjectGuid"] = data[i].ProjectGuid;
row["SerialNumber"] = data[i].SerialNumber;
row["ReadingNumber"] = data[i].ReadingNumber;
row["ReadingDate"] = data[i].ReadingDate;
row["ReadingTime"] = data[i].ReadingTime;
row["CombineDateTime"] = data[i].CombineDateTime;
row["RunTime"] = data[i].RunTime;
row["Temperature"] = data[i].Temperature;
dt.Rows.Add(row);
}
using (var conn = new SqlConnection(connectionString))
{
conn.Open();
using (SqlTransaction tr = conn.BeginTransaction())
{
using (var sqlBulk = new SqlBulkCopy(conn, SqlBulkCopyOptions.Default, tr))
{
sqlBulk.BatchSize = 1000;
sqlBulk.DestinationTableName = "RawData";
sqlBulk.WriteToServer(dt);
}
tr.Commit();
}
}
}
Is there another way to do this, or a better way to improve performance, so that the upload time is reduced? It can take a long time, and I am seeing memory usage steadily climb to around 500 MB.
TIA
You can improve performance by removing the DataTable and reading from the input stream directly.
SqlBulkCopy has a WriteToServer overload that accepts an IDataReader instead of an entire DataTable.
CsvHelper can parse CSV files using a StreamReader as input. It provides CsvDataReader as an IDataReader implementation on top of the CSV data. This allows reading directly from the input stream and writing to SqlBulkCopy.
The following method reads from an IFormFile, parses the stream using CsvHelper, and uses the CSV's fields to configure a SqlBulkCopy instance:
public async Task ToTable(IFormFile file, string table)
{
using (var stream = file.OpenReadStream())
using (var tx = new StreamReader(stream))
using (var reader = new CsvReader(tx))
using (var rd = new CsvDataReader(reader))
{
var headers = reader.Context.HeaderRecord;
var bcp = new SqlBulkCopy(_connection)
{
DestinationTableName = table
};
//Assume the file headers and table fields have the same names
foreach(var header in headers)
{
bcp.ColumnMappings.Add(header, header);
}
await bcp.WriteToServerAsync(rd);
}
}
This way nothing is ever written to a temp file or cached in memory. The uploaded files are parsed and written to the database directly.
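A possible call site, assuming a controller action that receives the uploaded files and a destination table named "RawData" (both the action and the table name are assumptions, not part of the original code):
// Hypothetical controller action streaming each uploaded CSV straight into SQL Server
public async Task<IActionResult> ImportCsvFiles(ICollection<IFormFile> files)
{
    foreach (var file in files)
    {
        await ToTable(file, "RawData"); // "RawData" is an assumed table name
    }
    return Ok();
}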
In addition to #Panagiotis's answer, why don't you interleave your file processing with the file upload? Wrap your file-processing logic in an async method, change the loop to a Parallel.ForEach, and process each file as it arrives instead of waiting for all of them.
private static readonly object listLock = new Object(); // only once at class level
List<string> fileLocations = new List<string>();
Parallel.ForEach(files, async (formFile) => // async lambda so the awaits below compile (note: it runs fire-and-forget)
{
var filePath = Path.GetTempFileName();
if (formFile.Length > 0)
{
using (var stream = new FileStream(filePath, FileMode.Create))
{
await formFile.CopyToAsync(stream);
}
await ProcessFileInToDbAsync(filePath);
}
// Added lock for thread safety of the List
lock (listLock)
{
fileLocations.Add(filePath);
}
});
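As a side note, await inside a Parallel.ForEach body needs an async lambda, which runs as fire-and-forget: exceptions and completion are not observed. A sketch of an alternative using Task.WhenAll (my own variation, not the original suggestion) keeps the uploads concurrent while remaining awaitable:
// Sketch: start one task per file and await them all
var tasks = files.Select(async formFile =>
{
    var filePath = Path.GetTempFileName();
    if (formFile.Length > 0)
    {
        using (var stream = new FileStream(filePath, FileMode.Create))
        {
            await formFile.CopyToAsync(stream);
        }
        await ProcessFileInToDbAsync(filePath); // processing method assumed from the snippet above
    }
    return filePath;
});
var fileLocations = (await Task.WhenAll(tasks)).ToList();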
Thanks to #Panagiotis Kanavos, I was able to work out what to do. Firstly, the way I was calling the methods was leaving them in memory. The CSV files I have are in two parts: vertical metadata and then the usual horizontal information, so I needed to split them in two. Saving them as temp files was also adding overhead. It has gone from taking 5-6 minutes to taking a minute, which for 100 files containing 8,500 rows isn't bad, I suppose.
Calling the method:
public async Task<IActionResult> UploadCsvFiles(ICollection<IFormFile> files, IFormCollection fc)
{
foreach (var f in files)
{
var getData = new GetData(_configuration);
await getData.SplitCsvData(f, uid);
}
return whatever;
}
This is the method doing the splitting:
public async Task SplitCsvData(IFormFile file, string uid)
{
var data = string.Empty;
var m = new List<string>();
var r = new List<string>();
var records = new List<string>();
using (var stream = file.OpenReadStream())
using (var reader = new StreamReader(stream))
{
while (!reader.EndOfStream)
{
var line = reader.ReadLine();
var header = line.Split(',')[0].ToString();
bool parsed = int.TryParse(header, out int result);
if (!parsed)
{
m.Add(line);
}
else
{
r.Add(line);
}
}
}
//TODO: Validation
//This splits the list into the metadata model. This is just a single object with static fields.
var metaData = SplitCsvMetaData.SplitMetaData(m, uid);
DataTable dtm = CreateMetaData(metaData);
var serialNumber = metaData.LoggerId;
await SaveMetaData("MetaData", dtm);
//
var lrd = new List<RawDataModel>();
foreach (string row in r)
{
lrd.Add(new RawDataModel
{
Id = 0,
SerialNumber = serialNumber,
ReadingNumber = Convert.ToInt32(row.Split(',')[0]),
ReadingDate = Convert.ToDateTime(row.Split(',')[1]).ToString("yyyy-MM-dd"),
ReadingTime = Convert.ToDateTime(row.Split(',')[2]).ToString("HH:mm:ss"),
RunTime = row.Split(',')[3].ToString(),
Temperature = Convert.ToDouble(row.Split(',')[4]),
ProjectGuid = uid.ToString(),
CombineDateTime = Convert.ToDateTime(row.Split(',')[1] + " " + row.Split(',')[2]).ToString("yyyy-MM-dd HH:mm:ss")
});
}
await SaveRawData("RawData", lrd);
}
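One small refinement (my own note, not part of the original): each row above is split five or six times; splitting once and reusing the fields avoids the repeated work:
// Sketch: split each row once and reuse the resulting fields
foreach (string row in r)
{
    var fields = row.Split(',');
    lrd.Add(new RawDataModel
    {
        Id = 0,
        SerialNumber = serialNumber,
        ReadingNumber = Convert.ToInt32(fields[0]),
        ReadingDate = Convert.ToDateTime(fields[1]).ToString("yyyy-MM-dd"),
        ReadingTime = Convert.ToDateTime(fields[2]).ToString("HH:mm:ss"),
        RunTime = fields[3],
        Temperature = Convert.ToDouble(fields[4]),
        ProjectGuid = uid,
        CombineDateTime = Convert.ToDateTime(fields[1] + " " + fields[2]).ToString("yyyy-MM-dd HH:mm:ss")
    });
}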
I then use a DataTable for the metadata (which takes 20 seconds for 100 files), as I map the field names to the columns.
public async Task SaveMetaData(string table, DataTable dt)
{
using (SqlBulkCopy sqlBulk = new SqlBulkCopy(_configuration.GetConnectionString("DefaultConnection"), SqlBulkCopyOptions.Default))
{
sqlBulk.DestinationTableName = table;
await sqlBulk.WriteToServerAsync(dt);
}
}
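When no column mappings are added, SqlBulkCopy maps by ordinal position; if name-based mapping is wanted, it can be added before WriteToServerAsync along these lines (a sketch, assuming the table columns share the DataTable's column names):
// Sketch: map each DataTable column to a destination column of the same name
foreach (DataColumn column in dt.Columns)
{
    sqlBulk.ColumnMappings.Add(column.ColumnName, column.ColumnName);
}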
I then use FastMember for the larger raw-data part, which is more like a traditional CSV.
public async Task SaveRawData(string table, IEnumerable<RawDataModel> lrd)
{
using (SqlBulkCopy sqlBulk = new SqlBulkCopy(_configuration.GetConnectionString("DefaultConnection"), SqlBulkCopyOptions.Default))
using (var reader = ObjectReader.Create(lrd, "Id","SerialNumber", "ReadingNumber", "ReadingDate", "ReadingTime", "RunTime", "Temperature", "ProjectGuid", "CombineDateTime"))
{
sqlBulk.DestinationTableName = table;
await sqlBulk.WriteToServerAsync(reader);
}
}
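For reference, the column list passed to ObjectReader implies a model shaped roughly like this (inferred from the code above; the property types are assumptions):
// Assumed shape of the raw-data model handed to FastMember's ObjectReader
public class RawDataModel
{
    public int Id { get; set; }
    public string SerialNumber { get; set; }
    public int ReadingNumber { get; set; }
    public string ReadingDate { get; set; }
    public string ReadingTime { get; set; }
    public string RunTime { get; set; }
    public double Temperature { get; set; }
    public string ProjectGuid { get; set; }
    public string CombineDateTime { get; set; }
}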
I am sure this can be improved on, but for now, this works really well.
I am attempting to recursively parse through a zip archive, and any zip archives inside it, to collect files using the following code:
private IList<IFile> _getFilesFromZip(ZipArchive zipArchive)
{
var returnFiles = new List<IFile>();
foreach (var zippedFile in zipArchive.Entries)
{
var name = zippedFile.Name;
var type = Path.GetExtension(zippedFile.Name).Replace(".", "");
if (type == "zip")
{
var innerZipArchive = new ZipArchive(zippedFile.Open(), ZipArchiveMode.Read);
returnFiles.AddRange(_getConversionFilesFromZip(zipArchive));
}
else
{ ...
The line
var innerZipArchive = new ZipArchive(zippedFile.Open(), ZipArchiveMode.Read);
throws a StackOverflowException no matter how small the files are.
How can I create a zipArchive from the ZipArchiveEntry of the first ZipArchive?
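For reference, the recursive call in the snippet above passes the outer zipArchive back in (and through a differently named method) rather than the newly opened innerZipArchive, which would recurse forever regardless of file size. A minimal sketch of the zip branch under that assumption, disposing the inner archive when done:
if (type == "zip")
{
    // Recurse into the nested archive opened from the entry's stream, not the outer archive
    using (var innerZipArchive = new ZipArchive(zippedFile.Open(), ZipArchiveMode.Read))
    {
        returnFiles.AddRange(_getFilesFromZip(innerZipArchive));
    }
}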
I made a bot application with the Microsoft Botbuilder. Now I want to create a PDF file from the user input. The file should be stored in my Azure storage.
I have a PDF template which should be copied and modified (this file is already in Azure storage). It has some text boxes which should be filled with the user input. I have already written the code for that with iTextSharp.
But I need a file stream for this code. Does anybody know how to get a stream for the file in my Azure storage? Or is there maybe another way to accomplish my task?
Edit:
Here is the code where I need the filestream
string fileNameExisting = Path.Combine(Directory.GetCurrentDirectory(), "Some.pdf");
string fileNameNew = @"Path/Some2.pdf";
var inv = new Invention
{
Inventor = new Inventor { Firstname = "TEST!", Lastname= "TEST!" },
Date = DateTime.Now,
Title = "TEST",
Slogan = "TEST!",
Description = "TEST!",
Advantages = "TEST!s",
TaskPosition = "TEST!",
TaskSolution = "TEST!"
};
using (var existingFileStream = new FileStream(fileNameExisting, FileMode.Open))
using (var newFileStream = new FileStream(fileNameNew, FileMode.Create))
{
// Open existing PDF
var pdfReader = new PdfReader(existingFileStream);
// PdfStamper, which will create
var stamper = new PdfStamper(pdfReader, newFileStream);
var form = stamper.AcroFields;
var fieldKeys = form.Fields.Keys;
foreach (string fieldKey in fieldKeys)
{
var props = fieldKey.Split('.');
string t = GetProp(props, inv);
form.SetField(fieldKey, t);
}
stamper.Close();
pdfReader.Close();
}
}
public static string GetProp(string[] classes, object oldObj)
{
var obj = oldObj.GetType().GetProperty(classes[0]).GetValue(oldObj, null);
if(classes.Length>1)
{
classes = classes.Skip(1).ToArray();
return GetProp(classes, obj);
}
Console.WriteLine(obj.ToString());
return obj.ToString();
}
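For example, with the Invention object above, an AcroField key such as "Inventor.Firstname" walks the nested properties (usage sketch):
// Usage sketch: resolves inv.Inventor.Firstname and returns "TEST!"
var value = GetProp("Inventor.Firstname".Split('.'), inv);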
The PdfReader constructor also takes a byte array. You should be able to create the object using something like:
var pdfTemplateBytes = await new WebClient().DownloadDataTaskAsync("https://myaccount.blob.core.windows.net/templates/mytemplate.pdf");
var pdfReader = new PdfReader(pdfTemplateBytes);
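If a stream is preferred over a byte array, the blob can also be downloaded into a seekable MemoryStream with the Azure Storage client library; a sketch assuming the WindowsAzure.Storage package and made-up container/blob names:
// Sketch: download the template blob into a MemoryStream and hand it to PdfReader
var account = CloudStorageAccount.Parse(storageConnectionString); // assumed connection string
var client = account.CreateCloudBlobClient();
var container = client.GetContainerReference("templates");   // hypothetical container name
var blob = container.GetBlockBlobReference("mytemplate.pdf"); // hypothetical blob name
using (var templateStream = new MemoryStream())
{
    await blob.DownloadToStreamAsync(templateStream);
    templateStream.Position = 0;
    var pdfReader = new PdfReader(templateStream); // iTextSharp's PdfReader also accepts a Stream
    // ... continue with PdfStamper as in the question's code ...
}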
I have a function that writes a bunch of "ImageSignatures" to a DbContext:
using (var db = new ImageContext())
{
foreach (var file in files)
{
var sig = new ImageSignature
{
FileName = file,
Signature = Signature(file),
};
Console.WriteLine("{0}: {1}", Path.GetFileName(sig.FileName), Sig2Str(sig.Signature));
if (sig.Signature != null)
{
db.Images.Add(sig);
}
}
try
{
records = db.SaveChanges(); // where the heck is this saving to!?
}
...
Where the Signature property is defined as
[MinLength(420)]
[MaxLength(420)]
[Required]
public sbyte[] Signature { get; set; }
If I put a breakpoint just before I Add the sig, I can see that it's not null, but a 420-byte array, as I expect.
On a later run of the application, I try to loop over the ImageSignatures I inserted:
foreach (var img1 in db.Images)
{
var set = new List<string> { img1.FileName };
foreach (var img2 in db.Images)
{
if (Distance(img1.Signature, img2.Signature) < 0.6)
{
set.Add(img2.FileName);
}
}
if (set.Count > 1)
{
dupeSets.Add(set);
}
}
But Signature is always coming back as null. What happened to it? How did it become null, when it wasn't null when I saved it?
Make sure you have saved the changes to your DbContext before disposing it, which will ensure that those changes are persisted to the underlying data store:
using (var db = new ImageContext())
{
foreach (var file in files)
{
var sig = new ImageSignature
{
FileName = file,
Signature = Signature(file),
};
Console.WriteLine("{0}: {1}", Path.GetFileName(sig.FileName), Sig2Str(sig.Signature));
db.Images.Add(sig);
}
db.SaveChanges();
}
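A quick way to verify that the values were persisted is to read one row back in a fresh context (a sketch, reusing the types from the question):
using (var db = new ImageContext())
{
    // Re-read a row and check whether the Signature column actually holds data
    var first = db.Images.FirstOrDefault();
    Console.WriteLine(first?.Signature == null
        ? "Signature is null in the store"
        : $"Signature length: {first.Signature.Length}");
}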