C# itext 7.1.4 (NuGet release) doesn't seem to parse OCG/layer titles correctly.
The C# code below should read a pdf, print all layer titles, turn off the layer visibility and save it to the dest file.
Example pdf file: https://docdro.id/qI479di
using iText.Kernel.Pdf;
using System;
namespace PDFSetOCGVisibility
{
    /// <summary>
    /// Repro program: prints the title of every optional-content layer (OCG) in a PDF,
    /// switches each layer off and writes the result to a second file.
    /// </summary>
    class Program
    {
        /// <summary>Entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            // Verbatim string literals use the '@' prefix.
            var src = @"layer-example.pdf";
            var dest = @"layer-example-out.pdf";
            PdfDocument pdf = new PdfDocument(new PdfReader(src), new PdfWriter(dest));
            var catalog = pdf.GetCatalog();
            // 'false' asks for the existing OCProperties only, so the result is null-checked
            // below in case the document has no layers.
            var ocProps = catalog.GetOCProperties(false);
            if (ocProps != null)
            {
                foreach (var layer in ocProps.GetLayers())
                {
                    // GetTitle() is the call under investigation in this repro.
                    var title = layer.GetTitle();
                    Console.WriteLine($"title: {title ?? "null"}");
                    layer.SetOn(false);
                }
            }
            // Closing the document writes the modified PDF to dest.
            pdf.Close();
        }
    }
}
Expected output is:
title: Layer 1
title: Layer 2
Actual output is:
title: null
title: null
Writing the file with disabled layers works fine but the layer titles are always null.
Just tested the itext5 version:
using iTextSharp.text.pdf;
using System;
using System.IO;
namespace PDFSetOCGVisibility5
{
    /// <summary>
    /// iText 5 (iTextSharp) variant of the layer repro: prints every layer name,
    /// switches the layer off and saves the stamped PDF to a second file.
    /// </summary>
    class Program
    {
        /// <summary>Entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            // Verbatim string literals use the '@' prefix.
            var src = @"layer-example.pdf";
            var dest = @"layer-example-out.pdf";
            var reader = new PdfReader(src);
            PdfStamper pdf = new PdfStamper(reader, new FileStream(dest, FileMode.Create));
            // GetPdfLayers() is enumerated as key/value pairs: the key is used as the
            // layer title, the value is the layer object.
            var layers = pdf.GetPdfLayers();
            foreach (var layer in layers)
            {
                var title = layer.Key;
                Console.WriteLine($"title: {title ?? "null"}");
                layer.Value.On = false;
            }
            pdf.Close();
            reader.Close();
        }
    }
}
It's working as expected, so this seems to be a regression in itext7
I don't know what's the purpose of title/GetTitle() but to get the Name (as displayed on the panel) the following code works:
// Reads the /Name entry of the layer dictionary; the null-conditional avoids a
// NullReferenceException when the entry is missing or is not a string.
var title = layer.GetPdfObject().GetAsString(PdfName.Name)?.ToUnicodeString();
Related
I am using iText to read the author and subject from stamp annotations.
If the annotation author includes non-ASCII characters (e.g. "äüö"), they are read as follows:
Anton M�ller
My code:
using System;
using System.IO;
using iText.Kernel.Pdf;
namespace iText7Test
{
    /// <summary>
    /// Repro program: walks every annotation on every page of a PDF and prints the
    /// author (/T) of each annotation whose /IT entry is /Stamp.
    /// </summary>
    class Program
    {
        /// <summary>Entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            // Verbatim string literals use the '@' prefix. The stream is disposed
            // deterministically instead of being left open until process exit.
            using (Stream inputStream = File.OpenRead(@"Stamp_Anton_Mueller.pdf"))
            {
                PdfDocument annoPdf = new PdfDocument(new PdfReader(inputStream));
                try
                {
                    for (int iPage = 1; iPage <= annoPdf.GetNumberOfPages(); iPage++)
                    {
                        PdfPage annoPage = annoPdf.GetPage(iPage);
                        var annotations = annoPage.GetAnnotations();
                        foreach (var annot in annotations)
                        {
                            PdfDictionary annoDict = annot.GetPdfObject();
                            // Skip everything that is not marked as a stamp.
                            if ("/Stamp" != annoDict.Get(PdfName.IT, true)?.ToString())
                            {
                                continue;
                            }
                            var subject = annoDict.Get(PdfName.Subj, true);
                            var author = annoDict.Get(PdfName.T); // this reads "Anton M�ller"
                            var creationDate = annoDict.Get(PdfName.CreationDate, false);
                            Console.WriteLine("\nAuthor of Stamp_Anton_Mueller.pdf: {0}", author); // this writes: "Author of Stamp_Anton_Mueller.pdf: Anton M?ller"
                        }
                    }
                }
                finally
                {
                    // Release the reader even when an annotation lookup throws.
                    annoPdf.Close();
                }
            }
        }
    }
}
If I simply load the inputStream into a string, the resulting string has the same � issues.
// Reads the whole stream as text; no Encoding argument is passed to the StreamReader.
string myPdfString = new StreamReader(inputStream).ReadToEnd();
However, if I set the Encoding parameter of the StreamReader, the Umlaut is shown correctly.
// Same read, but decoding with Encoding.Default passed explicitly to the StreamReader.
string encodedPdfString = new StreamReader(inputStream, Encoding.Default).ReadToEnd();
I did not see any option to choose Encoding for the PdfReader
Sample PDF: https://drive.google.com/file/d/1_bs47kSkITX1SdDYllVBQPhRP4D3xUAw/view
Hi, I want to convert a shapefile (.shp) to KML using the GDAL library in C#.
I wrote some code, but the output is not in KML format.
Here is my code:
using OSGeo.OGR;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using OSGeo.OSR;
using OSGeo.GDAL;
namespace ConsoleApp1 {
    /// <summary>
    /// Reads the first layer of a shapefile and writes the KML export of each
    /// feature geometry, one per line, to a text file.
    /// </summary>
    class Program {
        /// <summary>Configures GDAL/OGR and runs the conversion; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args) {
            GdalConfiguration.ConfigureGdal();
            GdalConfiguration.ConfigureOgr();
            convert();
        }

        /// <summary>
        /// Exports every non-null geometry of layer 0 with <c>ExportToKML</c> and writes the
        /// concatenated strings to c:/riv1.kml.
        /// NOTE(review): only the per-geometry strings are written; no enclosing KML
        /// document wrapper is added by this method.
        /// </summary>
        public static void convert() {
            // Verbatim string literal: '@' prefix, no space before the quote.
            string shapeFilePath = @"C:\riv1.shp";
            Ogr.RegisterAll();
            var drv = Ogr.GetDriverByName("ESRI Shapefile");
            var ds = drv.Open(shapeFilePath, 0);
            if (ds == null) {
                // Nothing to convert; avoid a NullReferenceException on the next line.
                Console.WriteLine("Could not open " + shapeFilePath);
                return;
            }
            OSGeo.OGR.Layer layer = ds.GetLayerByIndex(0);
            OSGeo.OGR.Feature f;
            layer.ResetReading();
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            while ((f = layer.GetNextFeature()) != null) {
                var geom = f.GetGeometryRef();
                if (geom != null) {
                    var geometryKml = geom.ExportToKML("");
                    sb.AppendLine(geometryKml);
                }
            }
            var kmlStr = sb.ToString();
            System.IO.File.WriteAllText("c:/riv1.kml", kmlStr);
        }
    }
}
This conversion works fine from the FWTools Shell, but I need to do it in my code.
Please help me if you know what I miss.
You can use the CopyLayer() method to just copy the shapefile layer to a new Kml datasource.
// Load the shapefile into a data source (read-only). Ogr.Open picks the driver itself,
// so no driver handle is fetched for the shapefile.
using (DataSource shpDatasource = Ogr.Open(shapefilePath, 0))
{
    if (shpDatasource == null)
        return false;
    // load the shapefile layer
    Layer shpLayer = shpDatasource.GetLayerByIndex(0);
    // Create the KML data source directly; the previous Ogr.Open(KmlPath, 0) result was
    // overwritten immediately and has been removed.
    Driver kmlDriver = Ogr.GetDriverByName("KML");
    using (DataSource KmlDatasource = kmlDriver.CreateDataSource(KmlPath, new string[] { }))
    {
        // Copy the shapefile layer; disposing the layer and data source releases the
        // native handles once the copy is done.
        Layer newLayer = KmlDatasource.CopyLayer(shpLayer, shpLayer.GetName(), new string[] { });
        newLayer.Dispose();
    }
}
Thank you very much Maxwell77. I just ran it and added a tiny modification to make it work correctly.
GdalConfiguration.ConfigureGdal();
GdalConfiguration.ConfigureOgr();
OSGeo.OGR.Ogr.RegisterAll();
// Ogr.Open selects the driver itself, so the unused "ESRI Shapefile" driver lookup was dropped.
// The source data source is now disposed as well, including on the early return.
using (DataSource shpDatasource = Ogr.Open(shapefilePath, 0))
{
    if (shpDatasource == null)
        return false;
    // load the shapefile layer
    Layer shpLayer = shpDatasource.GetLayerByIndex(0);
    // create the KML datasource layer
    using (Driver kmlDriver = Ogr.GetDriverByName("KML"))
    {
        using (DataSource KmlDatasource = kmlDriver.CreateDataSource(KmlPath, new string[] { }))
        {
            // copy the shapefile layer
            Layer newLayer = KmlDatasource.CopyLayer(shpLayer, shpLayer.GetName(), new string[] { });
            newLayer.Dispose();
        }
    }
}
}
I'm trying to scrape a website. I've accomplished this on other projects, but I can't seem to get this one right. It could be that I've been up working for over two days and am missing something. Could someone please look over my code? Here it is:
using System;
using System.Collections.Generic;
using HtmlAgilityPack;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;
using System.Xml.Linq;
using System.IO;
/// <summary>
/// Code-behind that scrapes one result page for business names, addresses and phone
/// numbers and returns them to the client as an XML document.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    List<string> names = new List<string>();
    List<string> address = new List<string>();
    List<string> number = new List<string>();

    /// <summary>
    /// Loads the result page, collects the three text columns and writes one
    /// <c>data</c> element per listing to the response.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        string url = "http://www.scoot.co.uk/find/" + "cafe" + " " + "-in-uk?page=" + "4";
        var Webget = new HtmlWeb();
        var doc = Webget.Load(url);

        // XPath attribute tests use '@' (e.g. @class).
        CollectText(doc.DocumentNode, "//h2//a", names);
        CollectText(doc.DocumentNode, "//p[@class='result-address']", address);
        CollectText(doc.DocumentNode, "//p[@class='result-number']", number);

        // The three lists are filled independently, so only indexes present in all of
        // them are emitted. NOTE(review): entries are paired purely by position; a listing
        // without an address or number would shift the following rows.
        int count = Math.Min(names.Count, Math.Min(address.Count, number.Count));

        XDocument doccy = new XDocument(
            new XDeclaration("1.0", "utf-8", "yes"),
            new XComment("Business For Sale"),
            new XElement("Data",
                from i in Enumerable.Range(0, count)
                select new XElement("data", new XAttribute("data", "data"),
                    // Element names must be valid XML names: no spaces or colons.
                    new XElement("Name", names[i]),
                    new XElement("Add", address[i]),
                    new XElement("Number", number[i])
                )
            )
        );

        Response.ContentType = "text/xml"; //Must be 'text/xml'
        Response.ContentEncoding = System.Text.Encoding.UTF8; //We'd like UTF-8
        doccy.Save(Response.Output); //Save to the text-writer
    }

    /// <summary>
    /// Appends the whitespace-collapsed inner HTML of the first child of every node
    /// matching <paramref name="xpath"/> to <paramref name="target"/>.
    /// </summary>
    private static void CollectText(HtmlNode root, string xpath, List<string> target)
    {
        var nodes = root.SelectNodes(xpath);
        if (nodes == null)
        {
            // No match for this XPath: nothing to add.
            return;
        }
        foreach (HtmlNode node in nodes)
        {
            // Guard the index: a matched element without children would otherwise throw.
            string html = node.HasChildNodes ? node.ChildNodes[0].InnerHtml : string.Empty;
            target.Add(Regex.Replace(html, @"\s{2,}", " "));
        }
    }
}
The website lists business names, phone numbers and addresses, each identified by a class name (result-address, result-number, etc.). I am trying to produce XML output containing the business name, address and phone number of each listing on page 4 for a presentation tomorrow, but I can't get it to work at all!
The results are right in all three foreach loops, but they won't output in the XML; I get an out-of-range error.
My first piece of advice would be to keep your CodeBehind as light as possible. If you bloat it up with business logic then the solution will become difficult to maintain. That's off topic, but I recommend looking up SOLID principles.
First, I've created a custom object to work with instead of using Lists of strings which have no way of knowing which address item links up with which name:
/// <summary>
/// One scraped business listing; groups the name, address and phone number that
/// belong together.
/// </summary>
public class Listing
{
/// <summary>Business name.</summary>
public string Name { get; set; }
/// <summary>Business address.</summary>
public string Address { get; set; }
/// <summary>Phone number.</summary>
public string Number { get; set; }
}
Here is the heart of it, a class that does all the scraping and serializing (I've broken SOLID principles but sometimes you just want it to work right.)
using System.Collections.Generic;
using HtmlAgilityPack;
using System.IO;
using System.Xml;
using System.Xml.Serialization;
using System.Linq;
/// <summary>
/// Scrapes business listings from a result page and serializes them to XML.
/// </summary>
public class TheScraper
{
    /// <summary>
    /// Downloads the result page with the given number and extracts every listing that
    /// has a name, an address and a phone number.
    /// </summary>
    /// <param name="pageNumber">Value appended to the <c>page=</c> query parameter.</param>
    /// <returns>The complete listings found on the page; empty when nothing matched.</returns>
    public List<Listing> DoTheScrape(int pageNumber)
    {
        List<Listing> result = new List<Listing>();
        string url = "http://www.scoot.co.uk/find/" + "cafe" + " " + "-in-uk?page=" + pageNumber;
        var Webget = new HtmlWeb();
        var doc = Webget.Load(url);

        // Select the container nodes that each hold one listing. XPath attribute tests
        // use '@' (e.g. @id, @class).
        var nodes = doc.DocumentNode.SelectNodes("//*[@id='list']/div/div/div/div");
        if (nodes == null)
        {
            // No match for the container XPath: return the empty list.
            return result;
        }

        foreach (var node in nodes)
        {
            Listing listing = new Listing();

            var nameNode = node.SelectSingleNode("./div/div/div/div/h2/a");
            if (nameNode != null)
            {
                listing.Name = nameNode.InnerText;
            }

            var addressNode = node.SelectSingleNode("./div/div/div/div/p[@class='result-address']");
            if (addressNode != null)
            {
                listing.Address = addressNode.InnerText.Trim();
            }

            var numberNode = node.SelectSingleNode("./div/div/div/div/p[@class='result-number']/a");
            if (numberNode != null)
            {
                // The attribute itself may be absent; leave Number null in that case so the
                // listing is filtered out below instead of throwing.
                var visibleNumber = numberNode.Attributes["data-visible-number"];
                if (visibleNumber != null)
                {
                    listing.Number = visibleNumber.Value;
                }
            }

            result.Add(listing);
        }

        // Keep only listings for which all three values were found.
        return result.Where(x => x.Name != null && x.Address != null && x.Number != null).ToList();
    }

    /// <summary>
    /// Serializes the listings to an indented XML string using <see cref="XmlSerializer"/>.
    /// </summary>
    /// <param name="listings">Listings to serialize.</param>
    /// <returns>The XML text.</returns>
    public string SerializeTheListings(List<Listing> listings)
    {
        var xmlSerializer = new XmlSerializer(typeof(List<Listing>));
        using (var stringWriter = new StringWriter())
        {
            using (var xmlWriter = XmlWriter.Create(stringWriter, new XmlWriterSettings { Indent = true }))
            {
                xmlSerializer.Serialize(xmlWriter, listings);
            }
            // Read the text only after the XmlWriter has been disposed, so everything it
            // buffered has reached the StringWriter.
            return stringWriter.ToString();
        }
    }
}
Then your code behind would look something like this, plus references to the scraper class and model class:
/// <summary>
/// Thin code-behind: delegates scraping and serialization to <c>TheScraper</c>.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>Scrapes five result pages and serializes the combined listings to XML.</summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        TheScraper scraper = new TheScraper();
        List<Listing> listings = new List<Listing>();
        // Quick hack: fetch a fixed five pages. If this runs frequently, detect the page
        // count or follow the "next page" link instead.
        // Pages are requested as 1..5; the earlier 0..4 loop never asked for page 5.
        // NOTE(review): assumes the site numbers result pages from 1 - confirm.
        for (int page = 1; page <= 5; page++)
        {
            // AddRange appends in place. Listing defines no equality, so Union removed
            // nothing and only rebuilt the list on every iteration.
            listings.AddRange(scraper.DoTheScrape(page));
        }
        // The XML is produced here; this handler does not yet write it to the response.
        string xmlListings = scraper.SerializeTheListings(listings);
    }
}
In DocumentDB documentation examples, I find insertion of C# objects.
// Build the Andersen family document, then insert it into the collection.
var thomas = new Parent { FirstName = "Thomas" };
var maryKay = new Parent { FirstName = "Mary Kay" };
Family AndersenFamily = new Family
{
    Id = "AndersenFamily",
    LastName = "Andersen",
    Parents = new Parent[] { thomas, maryKay },
    IsRegistered = true
};
await client.CreateDocumentAsync(documentCollection.DocumentsLink, AndersenFamily);
In my case, I'm receiving json strings from application client and would like to insert them in DocumentDB without deserializing them. Could not find any examples of doing something similar.
Any help is sincerely appreciated..
Thanks
Copied from the published .NET Sample code -
/// <summary>
/// Creates one document per *.json file found in .\Data by loading each file as a
/// stream, then reads the document with id "JSON1" back and replaces it with new JSON text.
/// </summary>
/// <param name="colSelfLink">Self-link of the target document collection.</param>
private static async Task UseStreams(string colSelfLink)
{
    // Verbatim string literal: '@' prefix keeps the backslash literal.
    var dir = new DirectoryInfo(@".\Data");
    var files = dir.EnumerateFiles("*.json");
    foreach (var file in files)
    {
        using (var fileStream = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
        {
            // The JSON is loaded straight from the stream; no user-defined type is involved.
            Document doc = await client.CreateDocumentAsync(colSelfLink, Resource.LoadFrom<Document>(fileStream));
            // The format string needs a {0} placeholder, otherwise doc is never printed.
            Console.WriteLine("Created Document: {0}", doc);
        }
    }

    //Read one the documents created above directly in to a Json string
    Document readDoc = client.CreateDocumentQuery(colSelfLink).Where(d => d.Id == "JSON1").AsEnumerable().First();
    string content = JsonConvert.SerializeObject(readDoc);

    //Update a document with some Json text,
    //Here we're replacing a previously created document with some new text and even introducing a new Property, Status=Cancelled
    using (var memoryStream = new MemoryStream(Encoding.UTF8.GetBytes("{\"id\": \"JSON1\",\"PurchaseOrderNumber\": \"PO18009186470\",\"Status\": \"Cancelled\"}")))
    {
        await client.ReplaceDocumentAsync(readDoc.SelfLink, Resource.LoadFrom<Document>(memoryStream));
    }
}
We have conditional Footers that INCLUDETEXT based on the client:
IF $CLIENT = "CLIENT1" "{INCLUDETEXT "CLIENT1HEADER.DOCX"}" ""
Depending on our document, there could be a varying amount of IF/ELSE, and these all work correctly for merging the correct files in the correct place.
However, some of these documents may have client specific images/branding, which also need to be copied across from the INCLUDETEXT file.
Below is the method that is called to replace any Picture elements that exist in the IEnumerable<Run> that is copied from the Source document to the Target document.
The image is copied fine; however, it doesn't appear to update the RID in my Picture or add a record to the .xml.rels files. (I even tried adding a ForEach to add it to all the headers and footers, to see if this made any difference.)
/// <summary>
/// For every picture in <paramref name="pics"/> that is also found among the pictures of
/// the source main document, copies the referenced image part into the target main
/// document part and points the picture's image data at the new relationship id.
/// </summary>
/// <param name="source">Document the image parts are read from.</param>
/// <param name="target">Document that receives the image parts.</param>
/// <param name="pics">Pictures whose image references should be re-targeted.</param>
private void InsertImagesFromOldDocToNewDoc(WordprocessingDocument source, WordprocessingDocument target, IEnumerable<Picture> pics)
{
    // Materialize once: the deferred query was previously re-evaluated for every picture.
    List<Picture> imageElements = source.MainDocumentPart.Document
        .Descendants<Run>()
        .Select(x => x.Descendants<Picture>().FirstOrDefault())
        .Where(x => x != null)
        .ToList();

    foreach (Picture pic in pics) //the new pics
    {
        Picture oldPic = imageElements.FirstOrDefault(x => x.Equals(pic));
        if (oldPic == null)
        {
            continue;
        }

        ImageData shape = oldPic.Descendants<ImageData>().FirstOrDefault();
        if (shape == null || shape.RelationshipId == null)
        {
            // Picture without image data or without a relationship id: nothing to copy.
            continue;
        }

        ImagePart p = source.MainDocumentPart.GetPartById(shape.RelationshipId) as ImagePart;
        if (p == null)
        {
            // The relationship does not point at an image part.
            continue;
        }

        ImagePart newPart = target.MainDocumentPart.AddPart<ImagePart>(p);
        using (var data = p.GetStream())
        {
            newPart.FeedData(data);
        }

        // Write back to the same property that was read (RelationshipId); the previous
        // code assigned RelId, a different property, so the id used for the lookup was
        // never updated. AddPart has already registered the part with the target, so no
        // separate CreateRelationshipToPart call is made.
        shape.RelationshipId = target.MainDocumentPart.GetIdOfPart(newPart);
    }
}
Has anyone come across this issue before?
It appears the OpenXML SDK documentation is a 'little' sparse...
Late reaction, but this thread helped me a lot to get it working. Here is my solution for copying a document with images:
/// <summary>
/// Copies the body of the Word document at <paramref name="path"/> into a new document
/// saved next to it with "-images" inserted before the extension, re-creating every
/// embedded image (DrawingML blips and VML image data) in the new package.
/// </summary>
/// <param name="path">Path of the source document. Files whose name starts with "~$" are skipped.</param>
private static void CopyDocumentWithImages(string path)
{
    if (Path.GetFileName(path).StartsWith("~$", StringComparison.Ordinal))
    {
        return;
    }

    // Build the target name from the path parts instead of string.Replace: Replace returns
    // the source path unchanged when ".docx" is absent (e.g. ".DOCX") and also rewrites
    // any ".docx" occurring inside a directory name.
    string targetPath = Path.ChangeExtension(path, null) + "-images" + Path.GetExtension(path);

    using (var source = WordprocessingDocument.Open(path, false))
    using (var newDoc = source.CreateNew(targetPath))
    {
        foreach (var e in source.MainDocumentPart.Document.Body.Elements())
        {
            var clonedElement = e.CloneNode(true);

            foreach (var blip in clonedElement.Descendants<DocumentFormat.OpenXml.Drawing.Blip>().ToList())
            {
                // A blip without an embed id has no part to copy.
                if (blip.Embed == null)
                {
                    continue;
                }
                blip.Embed = newDoc.CopyImage(blip.Embed, source);
            }

            foreach (var imageData in clonedElement.Descendants<DocumentFormat.OpenXml.Vml.ImageData>().ToList())
            {
                if (imageData.RelationshipId == null)
                {
                    continue;
                }
                imageData.RelationshipId = newDoc.CopyImage(imageData.RelationshipId, source);
            }

            newDoc.MainDocumentPart.Document.Body.AppendChild(clonedElement);
        }
        newDoc.Save();
    }
}
CopyImage:
/// <summary>
/// Copies the image part referenced by <paramref name="relId"/> in <paramref name="org"/>
/// into the main document part of <paramref name="newDoc"/>.
/// </summary>
/// <param name="newDoc">Document that receives the image.</param>
/// <param name="relId">Relationship id of the image in <paramref name="org"/>'s main document part.</param>
/// <param name="org">Document the image is read from.</param>
/// <returns>The relationship id of the image inside <paramref name="newDoc"/>.</returns>
/// <exception cref="ArgumentException"><paramref name="relId"/> does not refer to an image part.</exception>
public static string CopyImage(this WordprocessingDocument newDoc, string relId, WordprocessingDocument org)
{
    var p = org.MainDocumentPart.GetPartById(relId) as ImagePart;
    if (p == null)
    {
        // Fail with a clear message instead of passing null on to AddPart.
        throw new ArgumentException("Relationship '" + relId + "' does not refer to an image part.", nameof(relId));
    }

    var newPart = newDoc.MainDocumentPart.AddPart(p);
    // Dispose the source stream once its content has been fed into the new part.
    using (var data = p.GetStream())
    {
        newPart.FeedData(data);
    }
    return newDoc.MainDocumentPart.GetIdOfPart(newPart);
}
CreateNew:
/// <summary>
/// Creates a new, empty Word document at <paramref name="name"/> and copies the theme and
/// style definitions of <paramref name="org"/> into it when the source has them.
/// </summary>
/// <param name="org">Document whose theme and styles are copied.</param>
/// <param name="name">File path of the document to create.</param>
/// <returns>The newly created document, still open; the caller disposes it.</returns>
public static WordprocessingDocument CreateNew(this WordprocessingDocument org, string name)
{
    var doc = WordprocessingDocument.Create(name, WordprocessingDocumentType.Document);
    try
    {
        doc.AddMainDocumentPart();
        doc.MainDocumentPart.Document = new Document(new Body());

        // Both parts are optional in the source, so skip them when absent instead of
        // dereferencing null. The data is copied as a raw stream rather than through a
        // text reader/writer pair.
        var theme = org.MainDocumentPart.ThemePart;
        if (theme != null)
        {
            using (var data = theme.GetStream())
            {
                doc.MainDocumentPart.AddNewPart<ThemePart>().FeedData(data);
            }
        }

        var styles = org.MainDocumentPart.StyleDefinitionsPart;
        if (styles != null)
        {
            using (var data = styles.GetStream())
            {
                doc.MainDocumentPart.AddNewPart<StyleDefinitionsPart>().FeedData(data);
            }
        }

        return doc;
    }
    catch
    {
        // Do not leak the half-built package when copying fails.
        doc.Dispose();
        throw;
    }
}
Stuart,
I had faced the same problem when I was trying to copy the numbering styles from one document to the other.
I think what Word does internally is, whenever an object is copied from one document to the other the ID for that object is not copied over to the new document and instead what happens is a new ID is assigned to it.
You'll have to get the ID after the image has been copied and then replace it everywhere your image has been used.
I hope this helps; this is what I use to copy numbering styles.
Cheers