diff options
author | Michele Calgaro <michele.calgaro@yahoo.it> | 2024-05-11 21:28:48 +0900 |
---|---|---|
committer | Michele Calgaro <michele.calgaro@yahoo.it> | 2024-05-11 21:28:48 +0900 |
commit | 2462d03f322261bd616721c2b2065c4004b36c9c (patch) | |
tree | 239947a0737bb8386703a1497f12c09aebd3080a /fbreader/src/formats/doc | |
download | tde-ebook-reader-2462d03f322261bd616721c2b2065c4004b36c9c.tar.gz tde-ebook-reader-2462d03f322261bd616721c2b2065c4004b36c9c.zip |
Initial import (as is) from Debian Snapshot's 'fbreader' source code (https://snapshot.debian.org/package/fbreader/0.99.4%2Bdfsg-6).
The Debian code is provided under GPL2 license.
Signed-off-by: Michele Calgaro <michele.calgaro@yahoo.it>
Diffstat (limited to 'fbreader/src/formats/doc')
24 files changed, 4141 insertions, 0 deletions
diff --git a/fbreader/src/formats/doc/DocBookReader.cpp b/fbreader/src/formats/doc/DocBookReader.cpp new file mode 100644 index 0000000..99f471a --- /dev/null +++ b/fbreader/src/formats/doc/DocBookReader.cpp @@ -0,0 +1,377 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <vector> +#include <string> + +#include <ZLInputStream.h> +#include <ZLLogger.h> +#include <ZLFile.h> +#include <ZLStringUtil.h> +#include <ZLFileImage.h> + +#include "DocBookReader.h" +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +#include "OleStorage.h" +#include "OleMainStream.h" + +DocBookReader::DocBookReader(BookModel &model, const std::string &encoding) : + myModelReader(model), + myPictureCounter(0), + myEncoding(encoding) { + myReadState = READ_TEXT; +} + +bool DocBookReader::readBook() { + const ZLFile &file = myModelReader.model().book()->file(); + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull() || !stream->open()) { + return false; + } + myModelReader.setMainTextModel(); + myModelReader.pushKind(REGULAR); + myModelReader.beginParagraph(); + + if (!readDocument(stream, true)) { + return false; + } + + myModelReader.insertEndOfTextParagraph(); + return true; +} + +void DocBookReader::handleChar(ZLUnicodeUtil::Ucs2Char ucs2char) { + if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_INFO) { + myFieldInfoBuffer.push_back(ucs2char); + return; + } + if (myReadState == READ_FIELD && myReadFieldState == DONT_READ_FIELD_TEXT) { + return; + } + if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_TEXT && ucs2char == WORD_HORIZONTAL_TAB) { + //to remove pagination from TOC (from doc saved in OpenOffice) + myReadFieldState = DONT_READ_FIELD_TEXT; + return; + } + std::string utf8String; + ZLUnicodeUtil::Ucs2String ucs2String; + ucs2String.push_back(ucs2char); + ZLUnicodeUtil::ucs2ToUtf8(utf8String, ucs2String); + if (!myModelReader.paragraphIsOpen()) { + myModelReader.beginParagraph(); + } + myModelReader.addData(utf8String); +} + +void DocBookReader::handleHardLinebreak() { + if (myModelReader.paragraphIsOpen()) { + myModelReader.endParagraph(); + } + myModelReader.beginParagraph(); + if (!myCurrentStyleEntry.isNull()) { + myModelReader.addStyleEntry(*myCurrentStyleEntry); + } + for (std::size_t i = 0; i < myKindStack.size(); ++i) { + myModelReader.addControl(myKindStack.at(i), true); + } +} + +void DocBookReader::handleParagraphEnd() { + if (myModelReader.paragraphIsOpen()) { + myModelReader.endParagraph(); + } + myModelReader.beginParagraph(); + myCurrentStyleEntry = 0; +} + +void DocBookReader::handlePageBreak() { + if (myModelReader.paragraphIsOpen()) { + myModelReader.endParagraph(); + } + myCurrentStyleEntry = 0; + myModelReader.insertEndOfSectionParagraph(); + myModelReader.beginParagraph(); +} + +void DocBookReader::handleTableSeparator() { + handleChar(SPACE); + handleChar(VERTICAL_LINE); + handleChar(SPACE); +} + +void DocBookReader::handleTableEndRow() { + handleParagraphEnd(); +} + +void DocBookReader::handleFootNoteMark() { + //TODO implement +} + +void DocBookReader::handleStartField() { + if (myReadState == READ_FIELD) { //for nested fields + handleEndField(); + } + myReadState = READ_FIELD; + myReadFieldState = READ_FIELD_INFO; + myHyperlinkTypeState = NO_HYPERLINK; +} + +void DocBookReader::handleSeparatorField() { + static const std::string HYPERLINK = "HYPERLINK"; + static const std::string SEQUENCE = "SEQ"; +// static const std::string PAGE = "PAGE"; +// static const std::string PAGEREF = "PAGEREF"; +// static const std::string SHAPE = "SHAPE"; + static const std::string SPACE_DELIMETER = " "; + static const std::string LOCAL_LINK = "\\l"; + static const std::string QUOTE = "\""; + myReadFieldState = READ_FIELD_TEXT; + myHyperlinkTypeState = NO_HYPERLINK; + ZLUnicodeUtil::Ucs2String buffer = myFieldInfoBuffer; + myFieldInfoBuffer.clear(); + std::string utf8String; + ZLUnicodeUtil::ucs2ToUtf8(utf8String, buffer); + ZLUnicodeUtil::utf8Trim(utf8String); + if (utf8String.empty()) { + return; + } + std::vector<std::string> result = ZLStringUtil::split(utf8String, SPACE_DELIMETER); + //TODO split function can returns empty string, maybe fix it + std::vector<std::string> splitted; + for (std::size_t i = 0; i < result.size(); ++i) { + if (!result.at(i).empty()) { + splitted.push_back(result.at(i)); + } + } + + if (!splitted.empty() && splitted.at(0) == SEQUENCE) { + myReadFieldState = READ_FIELD_TEXT; + myHyperlinkTypeState = NO_HYPERLINK; + return; + } + + if (splitted.size() < 2 || splitted.at(0) != HYPERLINK) { + myReadFieldState = DONT_READ_FIELD_TEXT; + //to remove pagination from TOC and not hyperlink fields + return; + } + + if (splitted.at(1) == LOCAL_LINK) { + std::string link = parseLink(buffer); + if (!link.empty()) { + myModelReader.addHyperlinkControl(INTERNAL_HYPERLINK, link); + myHyperlinkTypeState = INT_HYPERLINK_INSERTED; + } + } else { + std::string link = parseLink(buffer, true); + if (!link.empty()) { + myModelReader.addHyperlinkControl(EXTERNAL_HYPERLINK, link); + myHyperlinkTypeState = EXT_HYPERLINK_INSERTED; + } + } +} + +void DocBookReader::handleEndField() { + myFieldInfoBuffer.clear(); + if (myReadState == READ_TEXT) { + return; + } + if (myHyperlinkTypeState == EXT_HYPERLINK_INSERTED) { + myModelReader.addControl(EXTERNAL_HYPERLINK, false); + } else if (myHyperlinkTypeState == INT_HYPERLINK_INSERTED) { + myModelReader.addControl(INTERNAL_HYPERLINK, false); + } + myReadState = READ_TEXT; + myHyperlinkTypeState = NO_HYPERLINK; + +} + +void DocBookReader::handleImage(const ZLFileImage::Blocks &blocks) { + std::string number; + ZLStringUtil::appendNumber(number, myPictureCounter++); + myModelReader.addImageReference(number); + ZLFile file(myModelReader.model().book()->file().path(), ZLMimeType::IMAGE_AUTO); + myModelReader.addImage(number, new ZLFileImage(file, blocks, ZLFileImage::ENCODING_NONE)); +} + +void DocBookReader::handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char) { + if (ucs2char == WORD_MINUS) { + handleChar(MINUS); + } else if (ucs2char == WORD_SOFT_HYPHEN) { + //skip + } else if (ucs2char == WORD_HORIZONTAL_TAB) { + handleChar(ucs2char); + } else { +// myTextBuffer.clear(); + } +} + +void DocBookReader::handleFontStyle(unsigned int fontStyle) { + if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_TEXT && myHyperlinkTypeState != NO_HYPERLINK) { + //to fix bug with hyperlink, that's only bold and doesn't looks like hyperlink + return; + } + while (!myKindStack.empty()) { + myModelReader.addControl(myKindStack.back(), false); + myKindStack.pop_back(); + } + if (fontStyle & OleMainStream::CharInfo::FONT_BOLD) { + myKindStack.push_back(BOLD); + } + if (fontStyle & OleMainStream::CharInfo::FONT_ITALIC) { + myKindStack.push_back(ITALIC); + } + for (std::size_t i = 0; i < myKindStack.size(); ++i) { + myModelReader.addControl(myKindStack.at(i), true); + } +} + +void DocBookReader::handleParagraphStyle(const OleMainStream::Style &styleInfo) { + if (styleInfo.HasPageBreakBefore) { + handlePageBreak(); + } + shared_ptr<ZLTextStyleEntry> entry = new ZLTextStyleEntry(ZLTextStyleEntry::STYLE_OTHER_ENTRY); + + switch (styleInfo.Alignment) { + default: // in that case, use default alignment type + break; + case OleMainStream::Style::ALIGNMENT_LEFT: + entry->setAlignmentType(ALIGN_LEFT); + break; + case OleMainStream::Style::ALIGNMENT_RIGHT: + entry->setAlignmentType(ALIGN_RIGHT); + break; + case OleMainStream::Style::ALIGNMENT_CENTER: + entry->setAlignmentType(ALIGN_CENTER); + break; + case OleMainStream::Style::ALIGNMENT_JUSTIFY: + entry->setAlignmentType(ALIGN_JUSTIFY); + break; + } + + //TODO in case, where style is heading, but size is small it works wrong + const ZLTextStyleEntry::SizeUnit unit = ZLTextStyleEntry::SIZE_UNIT_PERCENT; + switch (styleInfo.StyleIdCurrent) { + default: + break; + case OleMainStream::Style::STYLE_H1: + entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 140, unit); + break; + case OleMainStream::Style::STYLE_H2: + entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 120, unit); + break; + case OleMainStream::Style::STYLE_H3: + entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 110, unit); + break; + } + myCurrentStyleEntry = entry; + myModelReader.addStyleEntry(*myCurrentStyleEntry); + + // we should have the same font style, as for the previous paragraph, + // if it has the same StyleIdCurrent + if (myCurrentStyleInfo.StyleIdCurrent != OleMainStream::Style::STYLE_INVALID && + myCurrentStyleInfo.StyleIdCurrent == styleInfo.StyleIdCurrent) { + for (std::size_t i = 0; i < myKindStack.size(); ++i) { + myModelReader.addControl(myKindStack.at(i), true); + } + } else { + myKindStack.clear(); + // fill by the fontstyle, that was got from Stylesheet + handleFontStyle(styleInfo.CurrentCharInfo.FontStyle); + } + myCurrentStyleInfo = styleInfo; +} + +void DocBookReader::handleBookmark(const std::string &name) { + myModelReader.addHyperlinkLabel(name); +} + +std::string DocBookReader::parseLink(ZLUnicodeUtil::Ucs2String s, bool urlencode) { + //TODO add support for HYPERLINK like that: + // [0x13] HYPERLINK "http://site.ru/some text" \t "_blank" [0x14] text [0x15] + //Current implementation search for last QUOTE, so, it reads \t and _blank as part of link + //Last quote searching is need to handle link like that: + // [0x13] HYPERLINK "http://yandex.ru/yandsearch?text='some text' и "some text2"" [0x14] link text [0x15] + + static const ZLUnicodeUtil::Ucs2Char QUOTE = 0x22; + std::size_t i, first = 0; + //TODO maybe functions findFirstOf and findLastOf should be in ZLUnicodeUtil class + for (i = 0; i < s.size(); ++i) { + if (s.at(i) == QUOTE) { + first = i; + break; + } + } + if (i == s.size()) { + return std::string(); + } + std::size_t j, last = 0; + for (j = s.size(); j > 0 ; --j) { + if (s.at(j - 1) == QUOTE) { + last = j - 1; + break; + } + } + if (j == 0 || last == first) { + return std::string(); + } + + ZLUnicodeUtil::Ucs2String link; + for (std::size_t k = first + 1; k < last; ++k) { + ZLUnicodeUtil::Ucs2Char ch = s.at(k); + if (urlencode && ZLUnicodeUtil::isSpace(ch)) { + //TODO maybe implement function for encoding all signs in url, not only spaces and quotes + //TODO maybe add backslash support + link.push_back('%'); + link.push_back('2'); + link.push_back('0'); + } else if (urlencode && ch == QUOTE) { + link.push_back('%'); + link.push_back('2'); + link.push_back('2'); + } else { + link.push_back(ch); + } + } + std::string utf8String; + ZLUnicodeUtil::ucs2ToUtf8(utf8String, link); + return utf8String; +} + +void DocBookReader::footnotesStartHandler() { + handlePageBreak(); +} + +void DocBookReader::ansiDataHandler(const char *buffer, std::size_t len) { + if (myConverter.isNull()) { + // lazy converter initialization + ZLEncodingCollection &collection = ZLEncodingCollection::Instance(); + ZLEncodingConverterInfoPtr info = collection.info(myEncoding); + myConverter = info.isNull() ? collection.defaultConverter() : info->createConverter(); + } + std::string utf8String; + myConverter->convert(utf8String, buffer, buffer + len); + ZLUnicodeUtil::utf8ToUcs2(myBuffer, utf8String); +} + +void DocBookReader::ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol) { + myBuffer.push_back(symbol); +} diff --git a/fbreader/src/formats/doc/DocBookReader.h b/fbreader/src/formats/doc/DocBookReader.h new file mode 100644 index 0000000..d80fb8e --- /dev/null +++ b/fbreader/src/formats/doc/DocBookReader.h @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __DOCBOOKREADER_H__ +#define __DOCBOOKREADER_H__ + +#include <vector> + +#include <shared_ptr.h> +#include <ZLFile.h> +#include <ZLTextStyleEntry.h> +#include <ZLEncodingConverter.h> + +#include "../../bookmodel/BookReader.h" + +#include "OleMainStream.h" +#include "OleStreamParser.h" + +class DocBookReader : public OleStreamParser { + +public: + DocBookReader(BookModel &model, const std::string &encoding); + ~DocBookReader(); + bool readBook(); + +private: + void ansiDataHandler(const char *buffer, std::size_t len); + void ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol); + void footnotesStartHandler(); + + void handleChar(ZLUnicodeUtil::Ucs2Char ucs2char); + void handleHardLinebreak(); + void handleParagraphEnd(); + void handlePageBreak(); + void handleTableSeparator(); + void handleTableEndRow(); + void handleFootNoteMark(); + void handleStartField(); + void handleSeparatorField(); + void handleEndField(); + void handleImage(const ZLFileImage::Blocks &blocks); + void handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char); + + //formatting: + void handleFontStyle(unsigned int fontStyle); + void handleParagraphStyle(const OleMainStream::Style &styleInfo); + void handleBookmark(const std::string &name); + +private: + static std::string parseLink(ZLUnicodeUtil::Ucs2String s, bool urlencode = false); + +private: + BookReader myModelReader; + + ZLUnicodeUtil::Ucs2String myFieldInfoBuffer; + + enum { + READ_FIELD, + READ_TEXT + } myReadState; + + enum { + READ_FIELD_TEXT, + DONT_READ_FIELD_TEXT, + READ_FIELD_INFO + } myReadFieldState; + + //maybe it should be flag? + enum { + NO_HYPERLINK, + EXT_HYPERLINK_INSERTED, + INT_HYPERLINK_INSERTED + } myHyperlinkTypeState; + + //formatting + std::vector<FBTextKind> myKindStack; + shared_ptr<ZLTextStyleEntry> myCurrentStyleEntry; + OleMainStream::Style myCurrentStyleInfo; + unsigned int myPictureCounter; + + const std::string myEncoding; + shared_ptr<ZLEncodingConverter> myConverter; +}; + +inline DocBookReader::~DocBookReader() {} + +#endif /* __DOCBOOKREADER_H__ */ diff --git a/fbreader/src/formats/doc/DocFloatImageReader.cpp b/fbreader/src/formats/doc/DocFloatImageReader.cpp new file mode 100644 index 0000000..8c308e4 --- /dev/null +++ b/fbreader/src/formats/doc/DocFloatImageReader.cpp @@ -0,0 +1,384 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLLogger.h> + +#include "OleUtil.h" +#include "OleStream.h" +#include "OleMainStream.h" + +#include "DocFloatImageReader.h" + +DocFloatImageReader::DocFloatImageReader(unsigned int off, unsigned int len, shared_ptr<OleStream> tableStream, shared_ptr<OleStream> mainStream) : + myTableStream(tableStream), + myMainStream(mainStream), + myOffset(off), + myLength(len) { +} + +void DocFloatImageReader::readAll() { + //OfficeArtContent structure is described at p.405-406 [MS-DOC] + if (!myTableStream->seek(myOffset, true)) { + ZLLogger::Instance().println("DocPlugin", "problems with reading float images"); + return; + } + + unsigned int count = 0; + + RecordHeader header; + while (count < myLength) { + count += readRecordHeader(header, myTableStream); + switch (header.type) { + case 0xF000: + count += readDggContainer(myItem, header.length, myTableStream, myMainStream); + break; + case 0xF002: + count += readDgContainer(myItem, header.length, myTableStream); + break; + default: + return; + break; + } + } +} + +ZLFileImage::Blocks DocFloatImageReader::getBlocksForShapeId(unsigned int shapeId) const { + FSPContainer container; + bool found = false; + for (std::size_t i = 0; !found && i < myItem.FSPs.size(); ++i) { + if (myItem.FSPs.at(i).fsp.shapeId == shapeId) { + found = true; + container = myItem.FSPs.at(i); + } + } + + if (!found || container.fopte.empty()) { + return ZLFileImage::Blocks(); + } + + for (std::size_t i = 0; i < container.fopte.size(); ++i) { + const FOPTE &fopte = container.fopte.at(i); + if (fopte.pId == 0x0104 && !fopte.isComplex) { //0x0104 specifies the BLIP, see p.420 [MS-ODRAW] + if (fopte.value <= myItem.blips.size() && fopte.value > 0) { + Blip blip = myItem.blips.at(fopte.value - 1); + return blip.blocks; + } + } + } + return ZLFileImage::Blocks(); +} + +unsigned int DocFloatImageReader::readRecordHeader(RecordHeader &header, shared_ptr<OleStream> stream) { + //OfficeArtRecordHeader structure is described at p.26 [MS-ODRAW] + char buffer[8]; + stream->read(buffer, 8); + unsigned int temp = OleUtil::getU2Bytes(buffer, 0); + header.version = temp & 0x000F; + header.instance = temp >> 4; + header.type = OleUtil::getU2Bytes(buffer, 2); + header.length = OleUtil::getU4Bytes(buffer, 4); + return 8; +} + +unsigned int DocFloatImageReader::readDggContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream) { + //OfficeArtDggContainer structure is described at p.50 [MS-ODRAW] + RecordHeader header; + unsigned int count = 0; + + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF001: + count += readBStoreContainer(item, header.length, stream, mainStream); + break; + default: + count += skipRecord(header, stream); + break; + } + } + + stream->seek(1, false); //skipping dgglbl (see p.406 [MS-DOC]) + ++count; + + return count; +} + +unsigned int DocFloatImageReader::readBStoreContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream) { + //OfficeArtBStoreContainer structure is described at p.58 [MS-ODRAW] + RecordHeader header; + unsigned int count = 0; + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF007: + { + Blip blip; + count += readBStoreContainerFileBlock(blip, stream, mainStream); + item.blips.push_back(blip); + } + break; + default: + count += skipRecord(header, stream); + break; + } + } + return count; +} + +unsigned int DocFloatImageReader::skipRecord(const RecordHeader &header, shared_ptr<OleStream> stream) { + stream->seek(header.length, false); + return header.length; +} + +unsigned int DocFloatImageReader::readBStoreContainerFileBlock(Blip &blip, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream) { + //OfficeArtBStoreContainerFileBlock structure is described at p.59 [MS-ODRAW] + unsigned int count = readFBSE(blip.storeEntry, stream); + if (blip.storeEntry.offsetInDelay != (unsigned int)-1) { + if (mainStream->seek(blip.storeEntry.offsetInDelay, true)) { //see p.70 [MS-ODRAW] + //TODO maybe we should stop reading float images here + ZLLogger::Instance().println("DocPlugin", "DocFloatImageReader: problems with seeking for offset"); + return count; + } + } + RecordHeader header; + unsigned int count2 = readRecordHeader(header, mainStream); + switch (header.type) { + case OleMainStream::IMAGE_WMF: + case OleMainStream::IMAGE_EMF: + case OleMainStream::IMAGE_PICT: + count2 += skipRecord(header, mainStream); + break; + case OleMainStream::IMAGE_JPEG: + case OleMainStream::IMAGE_JPEG2: + case OleMainStream::IMAGE_PNG: + case OleMainStream::IMAGE_DIB: + case OleMainStream::IMAGE_TIFF: + count2 += readBlip(blip, header, mainStream); + break; + } + blip.type = header.type; + return count; +} + +unsigned int DocFloatImageReader::readBlip(Blip &blip, const RecordHeader &header, shared_ptr<OleStream> stream) { + //OfficeArtBlip structure is described at p.60-66 [MS-ODRAW] + stream->seek(16, false); //skipping rgbUid1 + unsigned int count = 16; + + bool addField = false; + switch (header.type) { + case OleMainStream::IMAGE_PNG: + if (header.instance == 0x6E1) { + addField = true; + } + break; + case OleMainStream::IMAGE_JPEG: + case OleMainStream::IMAGE_JPEG2: + if (header.instance == 0x46B || header.instance == 0x6E3) { + addField = true; + } + break; + case OleMainStream::IMAGE_DIB: + if (header.instance == 0x7A9) { + addField = true; + } + case OleMainStream::IMAGE_TIFF: + if (header.instance == 0x6E5) { + addField = true; + } + break; + } + + if (addField) { + stream->seek(16, false); //skipping rgbUid2 + count += 16; + } + stream->seek(1, false); //skipping tag + count += 1; + + blip.blocks = stream->getBlockPieceInfoList(stream->offset(), header.length - count); + count += header.length; + return count; +} + +unsigned int DocFloatImageReader::readFBSE(BlipStoreEntry &fbse, shared_ptr<OleStream> stream) { + //OfficeArtFBSE structure is described at p.68 [MS-ODRAW] + stream->seek(2, false); //skipping btWin32 and btMacOS + stream->seek(16, false); //skipping rgbUid + stream->seek(2, false); //skipping tag + fbse.size = read4Bytes(stream); + fbse.referenceCount = read4Bytes(stream); + fbse.offsetInDelay = read4Bytes(stream); + stream->seek(1, false); //skipping unused value + unsigned int lengthName = read1Byte(stream); //if it should be multiplied on 2? + stream->seek(2, false); // skipping unused values + if (lengthName > 0) { + stream->seek(lengthName, false); //skipping nameData + } + return 36 + lengthName; +} + +unsigned int DocFloatImageReader::readDgContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream) { + //OfficeArtDgContainer structure is described at p.52 [MS-ODRAW] + unsigned int count = 0; + + RecordHeader header; + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF008: //skip OfficeArtFDG record, p. 82 [MS-ODRAW] + stream->seek(8, false); + count += 8; + break; + case 0xF003: + count += readSpgrContainer(item, header.length, stream); + break; + case 0xF004: + { + FSPContainer fspContainer; + count += readSpContainter(fspContainer, header.length, stream); + item.FSPs.push_back(fspContainer); + } + break; + default: + count += skipRecord(header, stream); + break; + } + } + return count; +} + +unsigned int DocFloatImageReader::readSpgrContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream) { + //OfficeArtSpgrContainer structure is described at p.56 [MS-ODRAW] + unsigned count = 0; + RecordHeader header; + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF003: + count += readSpgrContainer(item, header.length, stream); + break; + case 0xF004: + { + FSPContainer fspContainer; + count += readSpContainter(fspContainer, header.length, stream); + item.FSPs.push_back(fspContainer); + } + break; + default: + count += skipRecord(header, stream); + break; + } + } + return count; +} + +unsigned int DocFloatImageReader::readSpContainter(FSPContainer &item, unsigned int length, shared_ptr<OleStream> stream) { + //OfficeArtSpContainter structure is described at p.53-55 [MS-ODRAW] + RecordHeader header; + unsigned int count = 0; + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF009: //skip OfficeArtFSPGR record, p.74 [MS-ODRAW] + stream->seek(16, false); + count += 16; + break; + case 0xF00A: + count += readFSP(item.fsp, stream); + break; + case 0xF00B: + count += readArrayFOPTE(item.fopte, header.length, stream); + break; + case 0xF00E: //OfficeArtAnchor + case 0xF00F: //OfficeArtChildAnchor, p.75 [MS-ODRAW] + case 0xF010: //OfficeArtClientAnchor + stream->seek(4, false); + count += 4; + break; + case 0xF00C: + case 0xF11F: + case 0xF11D: + break; + default: + count += skipRecord(header, stream); + break; + } + } + return count; +} + +unsigned int DocFloatImageReader::readFSP(FSP &fsp, shared_ptr<OleStream> stream) { + //OfficeArtFSP structure is described at p.76 [MS-ODRAW] + fsp.shapeId = read4Bytes(stream); + stream->seek(4, false); + return 8; +} + +unsigned int DocFloatImageReader::readArrayFOPTE(std::vector<FOPTE> &fopteArray,unsigned int length, shared_ptr<OleStream> stream) { + //OfficeArtRGFOPTE structure is described at p.98 [MS-ODRAW] + unsigned int count = 0; + while (count < length) { + FOPTE fopte; + count += readFOPTE(fopte, stream); + fopteArray.push_back(fopte); + } + for (std::size_t i = 0; i < fopteArray.size(); ++i) { + if (fopteArray.at(i).isComplex) { + stream->seek(fopteArray.at(i).value, false); + count += fopteArray.at(i).value; + } + } + return count; +} + +unsigned int DocFloatImageReader::readFOPTE(FOPTE &fopte, shared_ptr<OleStream> stream) { + //OfficeArtFOPTE structure is described at p.32 [MS-ODRAW] + unsigned int dtemp; + dtemp = read2Bytes(stream); + fopte.pId = (dtemp & 0x3fff); + fopte.isBlipId = ((dtemp & 0x4000) >> 14) == 0x1; + fopte.isComplex = ((dtemp & 0x8000) >> 15) == 0x1; + fopte.value = read4Bytes(stream); + return 6; +} + +unsigned int DocFloatImageReader::read1Byte(shared_ptr<OleStream> stream) { + char b[1]; + if (stream->read(b, 1) != 1) { + return 0; + } + return OleUtil::getU1Byte(b, 0); +} + +unsigned int DocFloatImageReader::read2Bytes(shared_ptr<OleStream> stream) { + char b[2]; + if (stream->read(b, 2) != 2) { + return 0; + } + return OleUtil::getU2Bytes(b, 0); +} + +unsigned int DocFloatImageReader::read4Bytes(shared_ptr<OleStream> stream) { + char b[4]; + if (stream->read(b, 4) != 4) { + return 0; + } + return OleUtil::getU4Bytes(b, 0); +} diff --git a/fbreader/src/formats/doc/DocFloatImageReader.h b/fbreader/src/formats/doc/DocFloatImageReader.h new file mode 100644 index 0000000..d2d6c2e --- /dev/null +++ b/fbreader/src/formats/doc/DocFloatImageReader.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __DOCFLOATIMAGEREADER_H__ +#define __DOCFLOATIMAGEREADER_H__ + +#include <ZLFileImage.h> + +class DocFloatImageReader { + +public: + struct BlipStoreEntry { // see p.68 [MS-ODRAW] + unsigned int size; // size of blip in stream + unsigned int referenceCount; // (cRef) reference count for the the blip + unsigned int offsetInDelay; // foDelay, file offset in the delay stream + }; + + struct Blip { //see p.59, p63-66 [MS-ODRAW] + BlipStoreEntry storeEntry; + unsigned int type; + ZLFileImage::Blocks blocks; + }; + + struct FSP { //see p.76-77 [MS-ODRAW] + unsigned int shapeId; //spid + }; + + struct FOPTE { //see p.98 and p.32 [MS-ODRAW] + unsigned int pId; //pid + bool isBlipId; //fBid + bool isComplex; //fComplex + unsigned int value; //op + }; + + struct FSPContainer { //see p.53-55 [MS-ODRAW] + FSP fsp; + std::vector<FOPTE> fopte; + }; + + struct OfficeArtContent { //see p.405-406 [MS-DOC] + std::vector<Blip> blips; //retrieved from OfficeArtDggContainer + std::vector<FSPContainer> FSPs; //retrieved from OfficeArtDgContainer + }; + + struct RecordHeader { //see p.26 [MS-ODRAW] + unsigned int version; + unsigned int instance; + unsigned int type; + unsigned int length; + }; + +public: + DocFloatImageReader(unsigned int off, unsigned int len, shared_ptr<OleStream> tableStream, shared_ptr<OleStream> mainStream); + +public: + void readAll(); + + ZLFileImage::Blocks getBlocksForShapeId(unsigned int shapeId) const; + +private: + static unsigned int readRecordHeader(RecordHeader &header, shared_ptr<OleStream> stream); + static unsigned int readDggContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream); + + static unsigned int readBStoreContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream); + static unsigned int readBStoreContainerFileBlock(Blip &blip, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream); + static unsigned int readBlip(Blip &blip, const RecordHeader &header, shared_ptr<OleStream> stream); + static unsigned int readFBSE(BlipStoreEntry &fbse, shared_ptr<OleStream> stream); + + static unsigned int readFOPTE(FOPTE &fopte, shared_ptr<OleStream> stream); + static unsigned int readArrayFOPTE(std::vector<FOPTE> &fopte, unsigned int length, shared_ptr<OleStream> stream); + static unsigned int readFSP(FSP &fsp, shared_ptr<OleStream> stream); + static unsigned int readSpContainter(FSPContainer &item, unsigned int length, shared_ptr<OleStream> stream); + static unsigned int readSpgrContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream); + static unsigned int readDgContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream); + + static unsigned int skipRecord(const RecordHeader &header, shared_ptr<OleStream> stream); + + static unsigned int read1Byte(shared_ptr<OleStream> stream); + static unsigned int read2Bytes(shared_ptr<OleStream> stream); + static unsigned int read4Bytes(shared_ptr<OleStream> stream); + +private: + shared_ptr<OleStream> myTableStream; + shared_ptr<OleStream> myMainStream; + unsigned int myOffset; + unsigned int myLength; + + OfficeArtContent myItem; +}; + +#endif /* __DOCFLOATIMAGEREADER_H__ */ diff --git a/fbreader/src/formats/doc/DocInlineImageReader.cpp b/fbreader/src/formats/doc/DocInlineImageReader.cpp new file mode 100644 index 0000000..69ce74f --- /dev/null +++ b/fbreader/src/formats/doc/DocInlineImageReader.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include "OleUtil.h" +#include "OleMainStream.h" + +#include "DocInlineImageReader.h" + +DocInlineImageReader::DocInlineImageReader(shared_ptr<OleStream> dataStream) : + myDataStream(dataStream) { +} + +ZLFileImage::Blocks DocInlineImageReader::getImagePieceInfo(unsigned int dataPos) { + if (myDataStream.isNull()) { + return ZLFileImage::Blocks(); + } + if (!myDataStream->seek(dataPos, true)) { + return ZLFileImage::Blocks(); + } + + //reading PICF structure (see p. 421 [MS-DOC]) + unsigned int picfHeaderSize = 4 + 2 + 8; //record length, headerLength and storage format + char headerBuffer[picfHeaderSize]; + if (myDataStream->read(headerBuffer, picfHeaderSize) != picfHeaderSize) { + return ZLFileImage::Blocks(); + } + unsigned int length = OleUtil::getU4Bytes(headerBuffer, 0); + unsigned int headerLength = OleUtil::getU2Bytes(headerBuffer, 4); + unsigned int formatType = OleUtil::getU2Bytes(headerBuffer, 6); + + if (formatType != 0x0064) { //external link to some file; see p.394 [MS-DOC] + //TODO implement + return ZLFileImage::Blocks(); + } + if (headerLength >= length) { + return ZLFileImage::Blocks(); + } + + //reading OfficeArtInlineSpContainer structure; see p.421 [MS-DOC] and p.56 [MS-ODRAW] + if (!myDataStream->seek(headerLength - picfHeaderSize, false)) { //skip header + return ZLFileImage::Blocks(); + } + + char buffer[8]; //for OfficeArtRecordHeader structure; see p.69 [MS-ODRAW] + bool found = false; + unsigned int curOffset = 0; + for (curOffset = headerLength; !found && curOffset + 8 <= length; curOffset += 8) { + if (myDataStream->read(buffer, 8) != 8) { + return ZLFileImage::Blocks(); + } + unsigned int recordInstance = OleUtil::getU2Bytes(buffer, 0) >> 4; + unsigned int recordType = OleUtil::getU2Bytes(buffer, 2); + unsigned int recordLen = OleUtil::getU4Bytes(buffer, 4); + + switch (recordType) { + case 0xF000: case 0xF001: case 0xF002: case 0xF003: case 0xF004: case 0xF005: + break; + case 0xF007: + { + myDataStream->seek(33, false); + char tmpBuf[1]; + myDataStream->read(tmpBuf, 1); + unsigned int nameLength = OleUtil::getU1Byte(tmpBuf, 0); + myDataStream->seek(nameLength * 2 + 2, false); + curOffset += 33 + 1 + nameLength * 2 + 2; + } + break; + case 0xF008: + myDataStream->seek(8, false); + curOffset += 8; + break; + case 0xF009: + myDataStream->seek(16, false); + curOffset += 16; + break; + case 0xF006: case 0xF00A: case 0xF00B: case 0xF00D: case 0xF00E: case 0xF00F: case 0xF010: case 0xF011: case 0xF122: + myDataStream->seek(recordLen, false); + curOffset += recordLen; + break; + case OleMainStream::IMAGE_EMF: + case OleMainStream::IMAGE_WMF: + case OleMainStream::IMAGE_PICT: + //TODO implement + return ZLFileImage::Blocks(); + case OleMainStream::IMAGE_JPEG: + case OleMainStream::IMAGE_JPEG2: + myDataStream->seek(17, false); + curOffset += 17; + if (recordInstance == 0x46B || recordInstance == 0x6E3) { + myDataStream->seek(16, false); + curOffset += 16; + } + found = true; + break; + case OleMainStream::IMAGE_PNG: + myDataStream->seek(17, false); + curOffset += 17; + if (recordInstance == 0x6E1) { + myDataStream->seek(16, false); + curOffset += 16; + } + found = true; + break; + case OleMainStream::IMAGE_DIB: // DIB = BMP without 14-bytes header + myDataStream->seek(17, false); + curOffset += 17; + if (recordInstance == 0x7A9) { + myDataStream->seek(16, false); + curOffset += 16; + } + found = true; + break; + case OleMainStream::IMAGE_TIFF: + myDataStream->seek(17, false); + curOffset += 17; + if (recordInstance == 0x6E5) { + myDataStream->seek(16, false); + curOffset += 16; + } + found = true; + break; + case 0xF00C: + default: + return ZLFileImage::Blocks(); + } + } + + if (!found) { + return ZLFileImage::Blocks(); + } + return myDataStream->getBlockPieceInfoList(dataPos + curOffset, length - curOffset); +} diff --git a/fbreader/src/formats/doc/DocInlineImageReader.h b/fbreader/src/formats/doc/DocInlineImageReader.h new file mode 100644 index 0000000..9dab9ae --- /dev/null +++ b/fbreader/src/formats/doc/DocInlineImageReader.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __DOCINLINEIMAGEREADER_H__ +#define __DOCINLINEIMAGEREADER_H__ + +#include <vector> + +#include "OleStream.h" + +class DocInlineImageReader { + +public: + DocInlineImageReader(shared_ptr<OleStream> dataStream); + ZLFileImage::Blocks getImagePieceInfo(unsigned int dataPos); + +private: + shared_ptr<OleStream> myDataStream; +}; + +#endif /* __DOCINLINEIMAGEREADER_H__ */ diff --git a/fbreader/src/formats/doc/DocMetaInfoReader.cpp b/fbreader/src/formats/doc/DocMetaInfoReader.cpp new file mode 100644 index 0000000..37b39c2 --- /dev/null +++ b/fbreader/src/formats/doc/DocMetaInfoReader.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLInputStream.h> + +#include "../../library/Book.h" + +#include "DocMetaInfoReader.h" + +DocMetaInfoReader::DocMetaInfoReader(Book &book) : myBook(book) { + myBook.removeAllAuthors(); + myBook.setTitle(std::string()); + myBook.setLanguage(std::string()); + myBook.removeAllTags(); +} + +bool DocMetaInfoReader::readMetaInfo() { + myBook.removeAllAuthors(); + myBook.setTitle(myBook.file().name(true)); + myBook.removeAllTags(); + return true; +} diff --git a/fbreader/src/formats/doc/DocMetaInfoReader.h b/fbreader/src/formats/doc/DocMetaInfoReader.h new file mode 100644 index 0000000..db26d29 --- /dev/null +++ b/fbreader/src/formats/doc/DocMetaInfoReader.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __DOCMETAINFOREADER_H__ +#define __DOCMETAINFOREADER_H__ + +#include <string> + +class Book; + +class DocMetaInfoReader { + +public: + DocMetaInfoReader(Book &book); + ~DocMetaInfoReader(); + bool readMetaInfo(); + + /* + void startElementHandler(int tag, const char **attributes); + void endElementHandler(int tag); + void characterDataHandler(const char *text, std::size_t len); + */ + +private: + Book &myBook; +}; + +inline DocMetaInfoReader::~DocMetaInfoReader() {} + +#endif /* __DOCMETAINFOREADER_H__ */ diff --git a/fbreader/src/formats/doc/DocPlugin.cpp b/fbreader/src/formats/doc/DocPlugin.cpp new file mode 100644 index 0000000..ef6f511 --- /dev/null +++ b/fbreader/src/formats/doc/DocPlugin.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLInputStream.h> +#include <ZLLogger.h> +#include <ZLImage.h> +#include <ZLEncodingConverter.h> + +#include "DocPlugin.h" +#include "DocMetaInfoReader.h" +#include "DocBookReader.h" +#include "DocStreams.h" +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +DocPlugin::DocPlugin() { +} + +DocPlugin::~DocPlugin() { +} + +bool DocPlugin::providesMetaInfo() const { + return true; +} + +const std::string DocPlugin::supportedFileType() const { + return "doc"; +} + +bool DocPlugin::acceptsFile(const ZLFile &file) const { + return file.extension() == "doc"; +} + +bool DocPlugin::readMetaInfo(Book &book) const { + if (!DocMetaInfoReader(book).readMetaInfo()) { + return false; + } + + shared_ptr<ZLInputStream> stream = new DocAnsiStream(book.file(), 50000); + if (!detectEncodingAndLanguage(book, *stream)) { + stream = new DocUcs2Stream(book.file(), 50000); + detectLanguage(book, *stream, ZLEncodingConverter::UTF8, true); + } + + return true; +} + +bool DocPlugin::readLanguageAndEncoding(Book &/*book*/) const { + return true; +} + +bool DocPlugin::readModel(BookModel &model) const { + return DocBookReader(model, model.book()->encoding()).readBook(); +} diff --git a/fbreader/src/formats/doc/DocPlugin.h b/fbreader/src/formats/doc/DocPlugin.h new file mode 100644 index 0000000..93b1803 --- /dev/null +++ b/fbreader/src/formats/doc/DocPlugin.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __DOCPLUGIN_H__ +#define __DOCPLUGIN_H__ + +#include "../FormatPlugin.h" + +class DocPlugin : public FormatPlugin { + +public: + DocPlugin(); + ~DocPlugin(); + bool providesMetaInfo() const; + + const std::string supportedFileType() const; + bool acceptsFile(const ZLFile &file) const; + bool readMetaInfo(Book &book) const; + bool readLanguageAndEncoding(Book &book) const; + bool readModel(BookModel &model) const; +}; + +#endif /* __DOCPLUGIN_H__ */ diff --git a/fbreader/src/formats/doc/DocStreams.cpp b/fbreader/src/formats/doc/DocStreams.cpp new file mode 100644 index 0000000..b21e15a --- /dev/null +++ b/fbreader/src/formats/doc/DocStreams.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstring> +#include <cstdlib> +#include <string> + +#include "DocStreams.h" +#include "OleStreamReader.h" + +class DocReader : public OleStreamReader { + +public: + DocReader(char *buffer, std::size_t maxSize); + ~DocReader(); + std::size_t readSize() const; + +private: + bool readStream(OleMainStream &stream); + void ansiDataHandler(const char *buffer, std::size_t len); + void ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol); + void footnotesStartHandler(); + +protected: + char *myBuffer; + const std::size_t myMaxSize; + std::size_t myActualSize; +}; + +class DocAnsiReader : public DocReader { + +public: + DocAnsiReader(char *buffer, std::size_t maxSize); + ~DocAnsiReader(); + +private: + void ansiDataHandler(const char *buffer, std::size_t len); +}; + +class DocUcs2Reader : public DocReader { + +public: + DocUcs2Reader(char *buffer, std::size_t maxSize); + ~DocUcs2Reader(); + +private: + void ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol); +}; + +DocReader::DocReader(char *buffer, std::size_t maxSize) : myBuffer(buffer), myMaxSize(maxSize), myActualSize(0) { +} + +DocReader::~DocReader() { +} + +bool DocReader::readStream(OleMainStream &stream) { + // TODO make 2 optmizations: + // 1) If another piece is too big, reading of next piece can be stopped if some size parameter will be specified + // (it can be transfered as a parameter (with default 0 value, that means no need to use it) to readNextPiece method) + // 2) We can specify as a parameter for readNextPiece, what kind of piece should be read next (ANSI or not ANSI). + // As type of piece is known already, there's no necessary to read other pieces. + while (myActualSize < myMaxSize) { + if (!readNextPiece(stream)) { + break; + } + } + return true; +} + +void DocReader::ansiDataHandler(const char*, std::size_t) { +} + +void DocReader::ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char) { +} + +void DocReader::footnotesStartHandler() { +} + +std::size_t DocReader::readSize() const { + return myActualSize; +} + +DocAnsiReader::DocAnsiReader(char *buffer, std::size_t maxSize) : DocReader(buffer, maxSize) { +} + +DocAnsiReader::~DocAnsiReader() { +} + +void DocAnsiReader::ansiDataHandler(const char *buffer, std::size_t dataLength) { + if (myActualSize < myMaxSize) { + const std::size_t len = std::min(dataLength, myMaxSize - myActualSize); + std::strncpy(myBuffer + myActualSize, buffer, len); + myActualSize += len; + } +} + +DocUcs2Reader::DocUcs2Reader(char *buffer, std::size_t maxSize) : DocReader(buffer, maxSize) { +} + +DocUcs2Reader::~DocUcs2Reader() { +} + +void DocUcs2Reader::ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol) { + if (myActualSize < myMaxSize) { + char buffer[4]; + const std::size_t dataLength = ZLUnicodeUtil::ucs2ToUtf8(buffer, symbol); + const std::size_t len = std::min(dataLength, myMaxSize - myActualSize); + std::strncpy(myBuffer + myActualSize, buffer, len); + myActualSize += len; + } +} + +DocStream::DocStream(const ZLFile& file, std::size_t maxSize) : myFile(file), myBuffer(0), mySize(maxSize) { +} + +DocStream::~DocStream() { + close(); +} + +bool DocStream::open() { + if (mySize != 0) { + myBuffer = new char[mySize]; + } + shared_ptr<DocReader> reader = createReader(myBuffer, mySize); + shared_ptr<ZLInputStream> stream = myFile.inputStream(); + if (stream.isNull() || !stream->open()) { + return false; + } + if (!reader->readDocument(stream, false)) { + return false; + } + mySize = reader->readSize(); + myOffset = 0; + return true; +} + +std::size_t DocStream::read(char *buffer, std::size_t maxSize) { + maxSize = std::min(maxSize, mySize - myOffset); + if (buffer != 0 && myBuffer != 0) { + std::memcpy(buffer, myBuffer + myOffset, maxSize); + } + myOffset += maxSize; + return maxSize; +} + +void DocStream::close() { + if (myBuffer != 0) { + delete[] myBuffer; + myBuffer = 0; + } +} + +void DocStream::seek(int offset, bool absoluteOffset) { + if (!absoluteOffset) { + offset += myOffset; + } + myOffset = std::min(mySize, (std::size_t)std::max(0, offset)); +} + +std::size_t DocStream::offset() const { + return myOffset; +} + +std::size_t DocStream::sizeOfOpened() { + return mySize; +} + +DocAnsiStream::DocAnsiStream(const ZLFile& file, std::size_t maxSize) : DocStream(file, maxSize) { +} + +DocAnsiStream::~DocAnsiStream() { +} + +shared_ptr<DocReader> DocAnsiStream::createReader(char *buffer, std::size_t maxSize) { + return new DocAnsiReader(buffer, maxSize); +} + +DocUcs2Stream::DocUcs2Stream(const ZLFile& file, std::size_t maxSize) : DocStream(file, maxSize) { +} + +DocUcs2Stream::~DocUcs2Stream() { +} + +shared_ptr<DocReader> DocUcs2Stream::createReader(char *buffer, std::size_t maxSize) { + return new DocUcs2Reader(buffer, maxSize); +} diff --git a/fbreader/src/formats/doc/DocStreams.h b/fbreader/src/formats/doc/DocStreams.h new file mode 100644 index 0000000..4b1538a --- /dev/null +++ b/fbreader/src/formats/doc/DocStreams.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __DOCSTREAMS_H__ +#define __DOCSTREAMS_H__ + +#include <ZLFile.h> +#include <ZLInputStream.h> + +class DocReader; + +class DocStream : public ZLInputStream { + +public: + DocStream(const ZLFile& file, std::size_t maxSize); + ~DocStream(); + +private: + bool open(); + std::size_t read(char *buffer, std::size_t maxSize); + void close(); + + void seek(int offset, bool absoluteOffset); + std::size_t offset() const; + std::size_t sizeOfOpened(); + +protected: + virtual shared_ptr<DocReader> createReader(char *buffer, std::size_t maxSize) = 0; + +private: + const ZLFile myFile; + char *myBuffer; + std::size_t mySize; + std::size_t myOffset; +}; + +class DocAnsiStream : public DocStream { + +public: + DocAnsiStream(const ZLFile& file, std::size_t maxSize); + ~DocAnsiStream(); + +private: + shared_ptr<DocReader> createReader(char *buffer, std::size_t maxSize); +}; + +class DocUcs2Stream : public DocStream { + +public: + DocUcs2Stream(const ZLFile& file, std::size_t maxSize); + ~DocUcs2Stream(); + +private: + shared_ptr<DocReader> createReader(char *buffer, std::size_t maxSize); +}; + +#endif /* __DOCSTREAMS_H__ */ diff --git a/fbreader/src/formats/doc/OleMainStream.cpp b/fbreader/src/formats/doc/OleMainStream.cpp new file mode 100644 index 0000000..fe829e6 --- /dev/null +++ b/fbreader/src/formats/doc/OleMainStream.cpp @@ -0,0 +1,1085 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <string> + +#include <ZLLogger.h> +#include <ZLUnicodeUtil.h> + +#include "OleUtil.h" +#include "OleStorage.h" + +#include "DocInlineImageReader.h" + +#include "OleMainStream.h" + +OleMainStream::Style::Style() : + StyleIdCurrent(STYLE_INVALID), + StyleIdNext(STYLE_INVALID), + HasPageBreakBefore(false), + BeforeParagraphIndent(0), + AfterParagraphIndent(0), + LeftIndent(0), + FirstLineIndent(0), + RightIndent(0), + Alignment(ALIGNMENT_DEFAULT) { +} + +OleMainStream::CharInfo::CharInfo() : FontStyle(FONT_REGULAR), FontSize(20) { +} + +OleMainStream::SectionInfo::SectionInfo() : CharPosition(0), IsNewPage(true) { +} + +OleMainStream::InlineImageInfo::InlineImageInfo() : DataPosition(0) { +} + +OleMainStream::FloatImageInfo::FloatImageInfo() : ShapeId(0) { +} + +OleMainStream::OleMainStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream) : OleStream(storage, oleEntry, stream) { +} + +bool OleMainStream::open(bool doReadFormattingData) { + if (OleStream::open() == false) { + return false; + } + + static const std::size_t HEADER_SIZE = 768; //size of data in header of main stream + char headerBuffer[HEADER_SIZE]; + seek(0, true); + + if (read(headerBuffer, HEADER_SIZE) != HEADER_SIZE) { + return false; + } + + bool result = readFIB(headerBuffer); + if (!result) { + return false; + } + + // determining table stream number + unsigned int tableNumber = (OleUtil::getU2Bytes(headerBuffer, 0xA) & 0x0200) ? 1 : 0; + std::string tableName = tableNumber == 0 ? "0" : "1"; + tableName += "Table"; + OleEntry tableEntry; + result = myStorage->getEntryByName(tableName, tableEntry); + + if (!result) { + // cant't find table stream (that can be only in case if file format is below Word 7/8), so building simple table stream + // TODO: CHECK may be not all old documents have ANSI + ZLLogger::Instance().println("DocPlugin", "cant't find table stream, building own simple piece table, that includes all charachters"); + Piece piece = {myStartOfText, myEndOfText - myStartOfText, true, Piece::PIECE_TEXT, 0}; + myPieces.push_back(piece); + return true; + } + + result = readPieceTable(headerBuffer, tableEntry); + + if (!result) { + ZLLogger::Instance().println("DocPlugin", "error during reading piece table"); + return false; + } + + if (!doReadFormattingData) { + return true; + } + + OleEntry dataEntry; + if (myStorage->getEntryByName("Data", dataEntry)) { + myDataStream = new OleStream(myStorage, dataEntry, myBaseStream); + } + + //result of reading following structures doesn't check, because all these + //problems can be ignored, and document can be showed anyway, maybe with wrong formatting + readBookmarks(headerBuffer, tableEntry); + readStylesheet(headerBuffer, tableEntry); + //readSectionsInfoTable(headerBuffer, tableEntry); //it isn't used now + readParagraphStyleTable(headerBuffer, tableEntry); + readCharInfoTable(headerBuffer, tableEntry); + readFloatingImages(headerBuffer, tableEntry); + return true; +} + +const OleMainStream::Pieces &OleMainStream::getPieces() const { + return myPieces; +} + +const OleMainStream::CharInfoList &OleMainStream::getCharInfoList() const { + return myCharInfoList; +} + +const OleMainStream::StyleInfoList &OleMainStream::getStyleInfoList() const { + return myStyleInfoList; +} + +const OleMainStream::BookmarksList &OleMainStream::getBookmarks() const { + return myBookmarks; +} + +const OleMainStream::InlineImageInfoList &OleMainStream::getInlineImageInfoList() const { + return myInlineImageInfoList; +} + +const OleMainStream::FloatImageInfoList &OleMainStream::getFloatImageInfoList() const { + return myFloatImageInfoList; +} + +ZLFileImage::Blocks OleMainStream::getFloatImage(unsigned int shapeId) const { + if (myFLoatImageReader.isNull()) { + return ZLFileImage::Blocks(); + } + return myFLoatImageReader->getBlocksForShapeId(shapeId); +} + +ZLFileImage::Blocks OleMainStream::getInlineImage(unsigned int dataPosition) const { + if (myDataStream.isNull()) { + return ZLFileImage::Blocks(); + } + DocInlineImageReader imageReader(myDataStream); + return imageReader.getImagePieceInfo(dataPosition); +} + +bool OleMainStream::readFIB(const char *headerBuffer) { + int flags = OleUtil::getU2Bytes(headerBuffer, 0xA); //offset for flags + + if (flags & 0x0004) { //flag for complex format + ZLLogger::Instance().println("DocPlugin", "This was fast-saved. Some information is lost"); + //lostInfo = (flags & 0xF0) >> 4); + } + + if (flags & 0x1000) { //flag for using extending charset + ZLLogger::Instance().println("DocPlugin", "File uses extended character set (get_word8_char)"); + } else { + ZLLogger::Instance().println("DocPlugin", "File uses get_8bit_char character set"); + } + + if (flags & 0x100) { //flag for encrypted files + ZLLogger::Instance().println("DocPlugin", "File is encrypted"); + // Encryption key = %08lx ; NumUtil::get4Bytes(header, 14) + return false; + } + + unsigned int charset = OleUtil::getU2Bytes(headerBuffer, 0x14); //offset for charset number + if (charset && charset != 0x100) { //0x100 = default charset + ZLLogger::Instance().println("DocPlugin", "Using not default character set %d"); + } else { + ZLLogger::Instance().println("DocPlugin", "Using default character set"); + } + + myStartOfText = OleUtil::get4Bytes(headerBuffer, 0x18); //offset for start of text value + myEndOfText = OleUtil::get4Bytes(headerBuffer, 0x1c); //offset for end of text value + return true; +} + +void OleMainStream::splitPieces(const Pieces &s, Pieces &dest1, Pieces &dest2, Piece::PieceType type1, Piece::PieceType type2, int boundary) { + Pieces source = s; + dest1.clear(); + dest2.clear(); + + int sumLength = 0; + std::size_t i = 0; + for (i = 0; i < source.size(); ++i) { + Piece piece = source.at(i); + if (piece.Length + sumLength >= boundary) { + Piece piece2 = piece; + + piece.Length = boundary - sumLength; + piece.Type = type1; + + piece2.Type = type2; + piece2.Offset += piece.Length * 2; + piece2.Length -= piece.Length; + + if (piece.Length > 0) { + dest1.push_back(piece); + } + if (piece2.Length > 0) { + dest2.push_back(piece2); + } + ++i; + break; + } + sumLength += piece.Length; + piece.Type = type1; + dest1.push_back(piece); + } + for (; i < source.size(); ++i) { + Piece piece = source.at(i); + piece.Type = type2; + dest2.push_back(piece); + } + +} + +std::string OleMainStream::getPiecesTableBuffer(const char *headerBuffer, OleStream &tableStream) { + unsigned int clxOffset = OleUtil::getU4Bytes(headerBuffer, 0x01A2); //offset for CLX structure + unsigned int clxLength = OleUtil::getU4Bytes(headerBuffer, 0x01A6); //offset for value of CLX structure length + + //1 step : loading CLX table from table stream + char *clxBuffer = new char[clxLength]; + if (!tableStream.seek(clxOffset, true)) { + ZLLogger::Instance().println("DocPlugin", "getPiecesTableBuffer -- error for seeking to CLX structure"); + return std::string(); + } + if (tableStream.read(clxBuffer, clxLength) != clxLength) { + ZLLogger::Instance().println("DocPlugin", "getPiecesTableBuffer -- CLX structure length is invalid"); + return std::string(); + } + std::string clx(clxBuffer, clxLength); + delete[] clxBuffer; + + //2 step: searching for pieces table buffer at CLX + //(determines it by 0x02 as start symbol) + std::size_t from = 0; + std::size_t i; + std::string pieceTableBuffer; + while ((i = clx.find_first_of(0x02, from)) != std::string::npos) { + if (clx.size() < i + 1 + 4) { + ZLLogger::Instance().println("DocPlugin", "getPiecesTableBuffer -- CLX structure has invalid format"); + return std::string(); + } + unsigned int pieceTableLength = OleUtil::getU4Bytes(clx.c_str(), i + 1); + pieceTableBuffer = std::string(clx, i + 1 + 4); + if (pieceTableBuffer.length() != pieceTableLength) { + from = i + 1; + continue; + } + break; + } + return pieceTableBuffer; +} + + +bool OleMainStream::readPieceTable(const char *headerBuffer, const OleEntry &tableEntry) { + OleStream tableStream(myStorage, tableEntry, myBaseStream); + std::string piecesTableBuffer = getPiecesTableBuffer(headerBuffer, tableStream); + + if (piecesTableBuffer.empty()) { + return false; + } + + //getting count of Character Positions for different types of subdocuments in Main Stream + int ccpText = OleUtil::get4Bytes(headerBuffer, 0x004C); //text + int ccpFtn = OleUtil::get4Bytes(headerBuffer, 0x0050); //footnote subdocument + int ccpHdd = OleUtil::get4Bytes(headerBuffer, 0x0054); //header subdocument + int ccpMcr = OleUtil::get4Bytes(headerBuffer, 0x0058); //macro subdocument + int ccpAtn = OleUtil::get4Bytes(headerBuffer, 0x005C); //comment subdocument + int ccpEdn = OleUtil::get4Bytes(headerBuffer, 0x0060); //endnote subdocument + int ccpTxbx = OleUtil::get4Bytes(headerBuffer, 0x0064); //textbox subdocument + int ccpHdrTxbx = OleUtil::get4Bytes(headerBuffer, 0x0068); //textbox subdocument of the header + int lastCP = ccpFtn + ccpHdd + ccpMcr + ccpAtn + ccpEdn + ccpTxbx + ccpHdrTxbx; + if (lastCP != 0) { + ++lastCP; + } + lastCP += ccpText; + + //getting the CP (character positions) and CP descriptors + std::vector<int> cp; //array of character positions for pieces + unsigned int j = 0; + for (j = 0; ; j += 4) { + if (piecesTableBuffer.size() < j + 4) { + ZLLogger::Instance().println("DocPlugin", "invalid piece table, cp ends not with a lastcp"); + break; + } + int curCP = OleUtil::get4Bytes(piecesTableBuffer.c_str(), j); + cp.push_back(curCP); + if (curCP == lastCP) { + break; + } + } + + if (cp.size() < 2) { + ZLLogger::Instance().println("DocPlugin", "invalid piece table, < 2 pieces"); + return false; + } + + std::vector<std::string> descriptors; + for (std::size_t k = 0; k < cp.size() - 1; ++k) { + //j + 4, because it should be taken after CP in PiecesTable Buffer + //k * 8, because it should be taken 8 byte for each descriptor + std::size_t substrFrom = j + 4 + k * 8; + if (piecesTableBuffer.size() < substrFrom + 8) { + ZLLogger::Instance().println("DocPlugin", "invalid piece table, problems with descriptors reading"); + break; + } + descriptors.push_back(piecesTableBuffer.substr(substrFrom, 8)); + } + + //filling the Pieces vector + std::size_t minValidSize = std::min(cp.size() - 1, descriptors.size()); + if (minValidSize == 0) { + ZLLogger::Instance().println("DocPlugin", "invalid piece table, there are no pieces"); + return false; + } + + for (std::size_t i = 0; i < minValidSize; ++i) { + //4byte integer with offset and ANSI flag + int fcValue = OleUtil::get4Bytes(descriptors.at(i).c_str(), 0x2); //offset for piece structure + Piece piece; + piece.IsANSI = (fcValue & 0x40000000) == 0x40000000; //ansi flag + piece.Offset = fcValue & 0x3FFFFFFF; //gettting offset for current piece + piece.Length = cp.at(i + 1) - cp.at(i); + myPieces.push_back(piece); + } + + //split pieces into different types + Pieces piecesText, piecesFootnote, piecesOther; + splitPieces(myPieces, piecesText, piecesFootnote, Piece::PIECE_TEXT, Piece::PIECE_FOOTNOTE, ccpText); + splitPieces(piecesFootnote, piecesFootnote, piecesOther, Piece::PIECE_FOOTNOTE, Piece::PIECE_OTHER, ccpFtn); + + myPieces.clear(); + for (std::size_t i = 0; i < piecesText.size(); ++i) { + myPieces.push_back(piecesText.at(i)); + } + for (std::size_t i = 0; i < piecesFootnote.size(); ++i) { + myPieces.push_back(piecesFootnote.at(i)); + } + for (std::size_t i = 0; i < piecesOther.size(); ++i) { + myPieces.push_back(piecesOther.at(i)); + } + + //converting length and offset depending on isANSI + for (std::size_t i = 0; i < myPieces.size(); ++i) { + Piece &piece = myPieces.at(i); + if (!piece.IsANSI) { + piece.Length *= 2; + } else { + piece.Offset /= 2; + } + } + + //filling startCP field + unsigned int curStartCP = 0; + for (std::size_t i = 0; i < myPieces.size(); ++i) { + Piece &piece = myPieces.at(i); + piece.startCP = curStartCP; + if (piece.IsANSI) { + curStartCP += piece.Length; + } else { + curStartCP += piece.Length / 2; + } + } + return true; +} + +bool OleMainStream::readBookmarks(const char *headerBuffer, const OleEntry &tableEntry) { + //SttbfBkmk structure is a table of bookmark name strings + unsigned int beginNamesInfo = OleUtil::getU4Bytes(headerBuffer, 0x142); // address of SttbfBkmk structure + std::size_t namesInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0x146); // length of SttbfBkmk structure + + if (namesInfoLength == 0) { + return true; //there's no bookmarks + } + + OleStream tableStream(myStorage, tableEntry, myBaseStream); + std::string buffer; + if (!readToBuffer(buffer, beginNamesInfo, namesInfoLength, tableStream)) { + return false; + } + + unsigned int recordsNumber = OleUtil::getU2Bytes(buffer.c_str(), 0x2); //count of records + + std::vector<std::string> names; + unsigned int offset = 0x6; //initial offset + for (unsigned int i = 0; i < recordsNumber; ++i) { + if (buffer.size() < offset + 2) { + ZLLogger::Instance().println("DocPlugin", "problmes with reading bookmarks names"); + break; + } + unsigned int length = OleUtil::getU2Bytes(buffer.c_str(), offset) * 2; //length of string in bytes + ZLUnicodeUtil::Ucs2String name; + for (unsigned int j = 0; j < length; j+=2) { + char ch1 = buffer.at(offset + 2 + j); + char ch2 = buffer.at(offset + 2 + j + 1); + ZLUnicodeUtil::Ucs2Char ucs2Char = (unsigned int)ch1 | ((unsigned int)ch2 << 8); + name.push_back(ucs2Char); + } + std::string utf8Name; + ZLUnicodeUtil::ucs2ToUtf8(utf8Name, name); + names.push_back(utf8Name); + offset += length + 2; + } + + //plcfBkmkf structure is table recording beginning CPs of bookmarks + unsigned int beginCharPosInfo = OleUtil::getU4Bytes(headerBuffer, 0x14A); // address of plcfBkmkf structure + std::size_t charPosInfoLen = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0x14E); // length of plcfBkmkf structure + + if (charPosInfoLen == 0) { + return true; //there's no bookmarks + } + + if (!readToBuffer(buffer, beginCharPosInfo, charPosInfoLen, tableStream)) { + return false; + } + + static const unsigned int BKF_SIZE = 4; + std::size_t size = calcCountOfPLC(charPosInfoLen, BKF_SIZE); + std::vector<unsigned int> charPage; + for (std::size_t index = 0, offset = 0; index < size; ++index, offset += 4) { + charPage.push_back(OleUtil::getU4Bytes(buffer.c_str(), offset)); + } + + for (std::size_t i = 0; i < names.size(); ++i) { + if (i >= charPage.size()) { + break; //for the case if something in these structures goes wrong, to not to lose all bookmarks + } + Bookmark bookmark; + bookmark.CharPosition = charPage.at(i); + bookmark.Name = names.at(i); + myBookmarks.push_back(bookmark); + } + + return true; +} + +bool OleMainStream::readStylesheet(const char *headerBuffer, const OleEntry &tableEntry) { + //STSH structure is a stylesheet + unsigned int beginStshInfo = OleUtil::getU4Bytes(headerBuffer, 0xa2); // address of STSH structure + std::size_t stshInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0xa6); // length of STSH structure + + OleStream tableStream(myStorage, tableEntry, myBaseStream); + char *buffer = new char[stshInfoLength]; + if (!tableStream.seek(beginStshInfo, true)) { + ZLLogger::Instance().println("DocPlugin", "problems with reading STSH structure"); + return false; + } + if (tableStream.read(buffer, stshInfoLength) != stshInfoLength) { + ZLLogger::Instance().println("DocPlugin", "problems with reading STSH structure, invalid length"); + return false; + } + + std::size_t stdCount = (std::size_t)OleUtil::getU2Bytes(buffer, 2); + std::size_t stdBaseInFile = (std::size_t)OleUtil::getU2Bytes(buffer, 4); + myStyleSheet.resize(stdCount); + + std::vector<bool> isFilled; + isFilled.resize(stdCount, false); + + std::size_t stdLen = 0; + bool styleSheetWasChanged = false; + do { //make it in while loop, because some base style can be after their successors + styleSheetWasChanged = false; + for (std::size_t index = 0, offset = 2 + (std::size_t)OleUtil::getU2Bytes(buffer, 0); index < stdCount; index++, offset += 2 + stdLen) { + stdLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset); + if (isFilled.at(index)) { + continue; + } + + if (stdLen == 0) { + //if record is empty, left it default + isFilled[index] = true; + continue; + } + + Style styleInfo = myStyleSheet.at(index); + + const unsigned int styleAndBaseType = OleUtil::getU2Bytes(buffer, offset + 4); + const unsigned int styleType = styleAndBaseType % 16; + const unsigned int baseStyleId = styleAndBaseType / 16; + if (baseStyleId == Style::STYLE_NIL || baseStyleId == Style::STYLE_USER) { + //if based on nil or user style, left default + } else { + int baseStyleIndex = getStyleIndex(baseStyleId, isFilled, myStyleSheet); + if (baseStyleIndex < 0) { + //this base style is not filled yet, so pass it at some time + continue; + } + styleInfo = myStyleSheet.at(baseStyleIndex); + styleInfo.StyleIdCurrent = Style::STYLE_INVALID; + } + + // parse STD structure + unsigned int tmp = OleUtil::getU2Bytes(buffer, offset + 6); + unsigned int upxCount = tmp % 16; + styleInfo.StyleIdNext = tmp / 16; + + //adding current style + myStyleSheet[index] = styleInfo; + isFilled[index] = true; + styleSheetWasChanged = true; + + std::size_t pos = 2 + stdBaseInFile; + std::size_t nameLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset + pos); + nameLen = nameLen * 2 + 2; //from Unicode characters to bytes + Unicode null charachter length + pos += 2 + nameLen; + if (pos % 2 != 0) { + ++pos; + } + if (pos >= stdLen) { + continue; + } + std::size_t upxLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset + pos); + if (pos + upxLen > stdLen) { + //UPX length too large + continue; + } + //for style info styleType must be equal 1 + if (styleType == 1 && upxCount >= 1) { + if (upxLen >= 2) { + styleInfo.StyleIdCurrent = OleUtil::getU2Bytes(buffer, offset + pos + 2); + getStyleInfo(0, buffer + offset + pos + 4, upxLen - 2, styleInfo); + myStyleSheet[index] = styleInfo; + } + pos += 2 + upxLen; + if (pos % 2 != 0) { + ++pos; + } + upxLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset + pos); + } + if (upxLen == 0 || pos + upxLen > stdLen) { + //too small/too large + continue; + } + //for char info styleType can be equal 1 or 2 + if ((styleType == 1 && upxCount >= 2) || (styleType == 2 && upxCount >= 1)) { + CharInfo charInfo; + getCharInfo(0, Style::STYLE_INVALID, buffer + offset + pos + 2, upxLen, charInfo); + styleInfo.CurrentCharInfo = charInfo; + myStyleSheet[index] = styleInfo; + } + } + } while (styleSheetWasChanged); + delete[] buffer; + return true; +} + +bool OleMainStream::readCharInfoTable(const char *headerBuffer, const OleEntry &tableEntry) { + //PlcfbteChpx structure is table with formatting for particular run of text + unsigned int beginCharInfo = OleUtil::getU4Bytes(headerBuffer, 0xfa); // address of PlcfbteChpx structure + std::size_t charInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0xfe); // length of PlcfbteChpx structure + if (charInfoLength < 4) { + return false; + } + + OleStream tableStream(myStorage, tableEntry, myBaseStream); + std::string buffer; + if (!readToBuffer(buffer, beginCharInfo, charInfoLength, tableStream)) { + return false; + } + + static const unsigned int CHPX_SIZE = 4; + std::size_t size = calcCountOfPLC(charInfoLength, CHPX_SIZE); + std::vector<unsigned int> charBlocks; + for (std::size_t index = 0, offset = (size + 1) * 4; index < size; ++index, offset += CHPX_SIZE) { + charBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), offset)); + } + + char *formatPageBuffer = new char[OleStorage::BBD_BLOCK_SIZE]; + for (std::size_t index = 0; index < charBlocks.size(); ++index) { + seek(charBlocks.at(index) * OleStorage::BBD_BLOCK_SIZE, true); + if (read(formatPageBuffer, OleStorage::BBD_BLOCK_SIZE) != OleStorage::BBD_BLOCK_SIZE) { + return false; + } + unsigned int crun = OleUtil::getU1Byte(formatPageBuffer, 0x1ff); //offset with crun (count of 'run of text') + for (unsigned int index2 = 0; index2 < crun; ++index2) { + unsigned int offset = OleUtil::getU4Bytes(formatPageBuffer, index2 * 4); + unsigned int chpxOffset = 2 * OleUtil::getU1Byte(formatPageBuffer, (crun + 1) * 4 + index2); + unsigned int len = OleUtil::getU1Byte(formatPageBuffer, chpxOffset); + unsigned int charPos = 0; + if (!offsetToCharPos(offset, charPos, myPieces)) { + continue; + } + unsigned int styleId = getStyleIdByCharPos(charPos, myStyleInfoList); + + CharInfo charInfo = getStyleFromStylesheet(styleId, myStyleSheet).CurrentCharInfo; + if (chpxOffset != 0) { + getCharInfo(chpxOffset, styleId, formatPageBuffer + 1, len - 1, charInfo); + } + myCharInfoList.push_back(CharPosToCharInfo(charPos, charInfo)); + + if (chpxOffset != 0) { + InlineImageInfo pictureInfo; + if (getInlineImageInfo(chpxOffset, formatPageBuffer + 1, len - 1, pictureInfo)) { + myInlineImageInfoList.push_back(CharPosToInlineImageInfo(charPos, pictureInfo)); + } + } + + } + } + delete[] formatPageBuffer; + return true; +} + +bool OleMainStream::readFloatingImages(const char *headerBuffer, const OleEntry &tableEntry) { + //Plcspa structure is a table with information for FSPA (File Shape Address) + unsigned int beginPicturesInfo = OleUtil::getU4Bytes(headerBuffer, 0x01DA); // address of Plcspa structure + if (beginPicturesInfo == 0) { + return true; //there's no office art objects + } + unsigned int picturesInfoLength = OleUtil::getU4Bytes(headerBuffer, 0x01DE); // length of Plcspa structure + if (picturesInfoLength < 4) { + return false; + } + + OleStream tableStream(myStorage, tableEntry, myBaseStream); + std::string buffer; + if (!readToBuffer(buffer, beginPicturesInfo, picturesInfoLength, tableStream)) { + return false; + } + + static const unsigned int SPA_SIZE = 26; + std::size_t size = calcCountOfPLC(picturesInfoLength, SPA_SIZE); + + std::vector<unsigned int> picturesBlocks; + for (std::size_t index = 0, tOffset = 0; index < size; ++index, tOffset += 4) { + picturesBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset)); + } + + for (std::size_t index = 0, tOffset = (size + 1) * 4; index < size; ++index, tOffset += SPA_SIZE) { + unsigned int spid = OleUtil::getU4Bytes(buffer.c_str(), tOffset); + FloatImageInfo info; + unsigned int charPos = picturesBlocks.at(index); + info.ShapeId = spid; + myFloatImageInfoList.push_back(CharPosToFloatImageInfo(charPos, info)); + } + + //DggInfo structure is office art object table data + unsigned int beginOfficeArtContent = OleUtil::getU4Bytes(headerBuffer, 0x22A); // address of DggInfo structure + if (beginOfficeArtContent == 0) { + return true; //there's no office art objects + } + unsigned int officeArtContentLength = OleUtil::getU4Bytes(headerBuffer, 0x022E); // length of DggInfo structure + if (officeArtContentLength < 4) { + return false; + } + + shared_ptr<OleStream> newTableStream = new OleStream(myStorage, tableEntry, myBaseStream); + shared_ptr<OleStream> newMainStream = new OleStream(myStorage, myOleEntry, myBaseStream); + if (newTableStream->open() && newMainStream->open()) { + myFLoatImageReader = new DocFloatImageReader(beginOfficeArtContent, officeArtContentLength, newTableStream, newMainStream); + myFLoatImageReader->readAll(); + } + return true; +} + +bool OleMainStream::readParagraphStyleTable(const char *headerBuffer, const OleEntry &tableEntry) { + //PlcBtePapx structure is table with formatting for all paragraphs + unsigned int beginParagraphInfo = OleUtil::getU4Bytes(headerBuffer, 0x102); // address of PlcBtePapx structure + std::size_t paragraphInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0x106); // length of PlcBtePapx structure + if (paragraphInfoLength < 4) { + return false; + } + + OleStream tableStream(myStorage, tableEntry, myBaseStream); + std::string buffer; + if (!readToBuffer(buffer, beginParagraphInfo, paragraphInfoLength, tableStream)) { + return false; + } + + static const unsigned int PAPX_SIZE = 4; + std::size_t size = calcCountOfPLC(paragraphInfoLength, PAPX_SIZE); + + std::vector<unsigned int> paragraphBlocks; + for (std::size_t index = 0, tOffset = (size + 1) * 4; index < size; ++index, tOffset += PAPX_SIZE) { + paragraphBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset)); + } + + char *formatPageBuffer = new char[OleStorage::BBD_BLOCK_SIZE]; + for (std::size_t index = 0; index < paragraphBlocks.size(); ++index) { + seek(paragraphBlocks.at(index) * OleStorage::BBD_BLOCK_SIZE, true); + if (read(formatPageBuffer, OleStorage::BBD_BLOCK_SIZE) != OleStorage::BBD_BLOCK_SIZE) { + return false; + } + const unsigned int paragraphsCount = OleUtil::getU1Byte(formatPageBuffer, 0x1ff); //offset with 'cpara' value (count of paragraphs) + for (unsigned int index2 = 0; index2 < paragraphsCount; ++index2) { + const unsigned int offset = OleUtil::getU4Bytes(formatPageBuffer, index2 * 4); + unsigned int papxOffset = OleUtil::getU1Byte(formatPageBuffer, (paragraphsCount + 1) * 4 + index2 * 13) * 2; + if (papxOffset <= 0) { + continue; + } + unsigned int len = OleUtil::getU1Byte(formatPageBuffer, papxOffset) * 2; + if (len == 0) { + ++papxOffset; + len = OleUtil::getU1Byte(formatPageBuffer, papxOffset) * 2; + } + + const unsigned int styleId = OleUtil::getU2Bytes(formatPageBuffer, papxOffset + 1); + Style styleInfo = getStyleFromStylesheet(styleId, myStyleSheet); + + if (len >= 3) { + getStyleInfo(papxOffset, formatPageBuffer + 3, len - 3, styleInfo); + } + + unsigned int charPos = 0; + if (!offsetToCharPos(offset, charPos, myPieces)) { + continue; + } + myStyleInfoList.push_back(CharPosToStyle(charPos, styleInfo)); + } + } + delete[] formatPageBuffer; + return true; +} + +bool OleMainStream::readSectionsInfoTable(const char *headerBuffer, const OleEntry &tableEntry) { + //PlcfSed structure is a section table + unsigned int beginOfText = OleUtil::getU4Bytes(headerBuffer, 0x18); //address of text's begin in main stream + unsigned int beginSectInfo = OleUtil::getU4Bytes(headerBuffer, 0xca); //address if PlcfSed structure + + std::size_t sectInfoLen = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0xce); //length of PlcfSed structure + if (sectInfoLen < 4) { + return false; + } + + OleStream tableStream(myStorage, tableEntry, myBaseStream); + std::string buffer; + if (!readToBuffer(buffer, beginSectInfo, sectInfoLen, tableStream)) { + return false; + } + + static const unsigned int SED_SIZE = 12; + std::size_t decriptorsCount = calcCountOfPLC(sectInfoLen, SED_SIZE); + + //saving the section offsets (in character positions) + std::vector<unsigned int> charPos; + for (std::size_t index = 0, tOffset = 0; index < decriptorsCount; ++index, tOffset += 4) { + unsigned int ulTextOffset = OleUtil::getU4Bytes(buffer.c_str(), tOffset); + charPos.push_back(beginOfText + ulTextOffset); + } + + //saving sepx offsets + std::vector<unsigned int> sectPage; + for (std::size_t index = 0, tOffset = (decriptorsCount + 1) * 4; index < decriptorsCount; ++index, tOffset += SED_SIZE) { + sectPage.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset + 2)); + } + + //reading the section properties + char tmpBuffer[2]; + for (std::size_t index = 0; index < sectPage.size(); ++index) { + if (sectPage.at(index) == 0xffffffffUL) { //check for invalid record, to make default section info + SectionInfo sectionInfo; + sectionInfo.CharPosition = charPos.at(index); + mySectionInfoList.push_back(sectionInfo); + continue; + } + //getting number of bytes to read + if (!seek(sectPage.at(index), true)) { + continue; + } + if (read(tmpBuffer, 2) != 2) { + continue; + } + std::size_t bytes = 2 + (std::size_t)OleUtil::getU2Bytes(tmpBuffer, 0); + + if (!seek(sectPage.at(index), true)) { + continue; + } + char *formatPageBuffer = new char[bytes]; + if (read(formatPageBuffer, bytes) != bytes) { + delete[] formatPageBuffer; + continue; + } + SectionInfo sectionInfo; + sectionInfo.CharPosition = charPos.at(index); + getSectionInfo(formatPageBuffer + 2, bytes - 2, sectionInfo); + mySectionInfoList.push_back(sectionInfo); + delete[] formatPageBuffer; + } + return true; +} + +void OleMainStream::getStyleInfo(unsigned int papxOffset, const char *grpprlBuffer, unsigned int bytes, Style &styleInfo) { + int tmp, toDelete, toAdd; + unsigned int offset = 0; + while (bytes >= offset + 2) { + unsigned int curPrlLength = 0; + switch (OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset)) { + case 0x2403: + styleInfo.Alignment = (Style::AlignmentType)OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 2); + break; + case 0x4610: + styleInfo.LeftIndent += OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2); + if (styleInfo.LeftIndent < 0) { + styleInfo.LeftIndent = 0; + } + break; + case 0xc60d: // ChgTabsPapx + case 0xc615: // ChgTabs + tmp = OleUtil::get1Byte(grpprlBuffer, papxOffset + offset + 2); + if (tmp < 2) { + curPrlLength = 1; + break; + } + toDelete = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 3); + if (tmp < 2 + 2 * toDelete) { + curPrlLength = 1; + break; + } + toAdd = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 4 + 2 * toDelete); + if (tmp < 2 + 2 * toDelete + 2 * toAdd) { + curPrlLength = 1; + break; + } + break; + case 0x840e: + styleInfo.RightIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2); + break; + case 0x840f: + styleInfo.LeftIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2); + break; + case 0x8411: + styleInfo.FirstLineIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2); + break; + case 0xa413: + styleInfo.BeforeParagraphIndent = OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2); + break; + case 0xa414: + styleInfo.AfterParagraphIndent = OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2); + break; + case 0x2407: + styleInfo.HasPageBreakBefore = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 2) == 0x01; + break; + default: + break; + } + if (curPrlLength == 0) { + curPrlLength = getPrlLength(grpprlBuffer, papxOffset + offset); + } + offset += curPrlLength; + } + +} + +void OleMainStream::getCharInfo(unsigned int chpxOffset, unsigned int /*styleId*/, const char *grpprlBuffer, unsigned int bytes, CharInfo &charInfo) { + unsigned int sprm = 0; //single propery modifier + unsigned int offset = 0; + while (bytes >= offset + 2) { + switch (OleUtil::getU2Bytes(grpprlBuffer, chpxOffset + offset)) { + case 0x0835: //bold + sprm = OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2); + switch (sprm) { + case UNSET: + charInfo.FontStyle &= ~CharInfo::FONT_BOLD; + break; + case SET: + charInfo.FontStyle |= CharInfo::FONT_BOLD; + break; + case UNCHANGED: + break; + case NEGATION: + charInfo.FontStyle ^= CharInfo::FONT_BOLD; + break; + default: + break; + } + break; + case 0x0836: //italic + sprm = OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2); + switch (sprm) { + case UNSET: + charInfo.FontStyle &= ~CharInfo::FONT_ITALIC; + break; + case SET: + charInfo.FontStyle |= CharInfo::FONT_ITALIC; + break; + case UNCHANGED: + break; + case NEGATION: + charInfo.FontStyle ^= CharInfo::FONT_ITALIC; + break; + default: + break; + } + break; + case 0x4a43: //size of font + charInfo.FontSize = OleUtil::getU2Bytes(grpprlBuffer, chpxOffset + offset + 2); + break; + default: + break; + } + offset += getPrlLength(grpprlBuffer, chpxOffset + offset); + } + +} + +void OleMainStream::getSectionInfo(const char *grpprlBuffer, std::size_t bytes, SectionInfo §ionInfo) { + unsigned int tmp; + std::size_t offset = 0; + while (bytes >= offset + 2) { + switch (OleUtil::getU2Bytes(grpprlBuffer, offset)) { + case 0x3009: //new page + tmp = OleUtil::getU1Byte(grpprlBuffer, offset + 2); + sectionInfo.IsNewPage = (tmp != 0 && tmp != 1); + break; + default: + break; + } + offset += getPrlLength(grpprlBuffer, offset); + } +} + +bool OleMainStream::getInlineImageInfo(unsigned int chpxOffset, const char *grpprlBuffer, unsigned int bytes, InlineImageInfo &pictureInfo) { + //p. 105 of [MS-DOC] documentation + unsigned int offset = 0; + bool isFound = false; + while (bytes >= offset + 2) { + switch (OleUtil::getU2Bytes(grpprlBuffer, chpxOffset + offset)) { + case 0x080a: // ole object, p.107 [MS-DOC] + if (OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2) == 0x01) { + return false; + } + break; + case 0x0806: // is not a picture, but a binary data? (sprmCFData, p.106 [MS-DOC]) + if (OleUtil::getU4Bytes(grpprlBuffer, chpxOffset + offset + 2) == 0x01) { + return false; + } + break; +// case 0x0855: // sprmCFSpec, p.117 [MS-DOC], MUST BE applied with a value of 1 (see p.105 [MS-DOC]) +// if (OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2) != 0x01) { +// return false; +// } +// break; + case 0x6a03: // location p.105 [MS-DOC] + pictureInfo.DataPosition = OleUtil::getU4Bytes(grpprlBuffer, chpxOffset + offset + 2); + isFound = true; + break; + default: + break; + } + offset += getPrlLength(grpprlBuffer, chpxOffset + offset); + } + return isFound; +} + +OleMainStream::Style OleMainStream::getStyleFromStylesheet(unsigned int styleId, const StyleSheet &stylesheet) { + //TODO optimize it: StyleSheet can be map structure with styleId key + Style style; + if (styleId != Style::STYLE_INVALID && styleId != Style::STYLE_NIL && styleId != Style::STYLE_USER) { + for (std::size_t index = 0; index < stylesheet.size(); ++index) { + if (stylesheet.at(index).StyleIdCurrent == styleId) { + return stylesheet.at(index); + } + } + } + style.StyleIdCurrent = styleId; + return style; +} + +int OleMainStream::getStyleIndex(unsigned int styleId, const std::vector<bool> &isFilled, const StyleSheet &stylesheet) { + //TODO optimize it: StyleSheet can be map structure with styleId key + //in that case, this method will be excess + if (styleId == Style::STYLE_INVALID) { + return -1; + } + for (int index = 0; index < (int)stylesheet.size(); ++index) { + if (isFilled.at(index) && stylesheet.at(index).StyleIdCurrent == styleId) { + return index; + } + } + return -1; +} + +unsigned int OleMainStream::getStyleIdByCharPos(unsigned int charPos, const StyleInfoList &styleInfoList) { + unsigned int styleId = Style::STYLE_INVALID; + for (std::size_t i = 0; i < styleInfoList.size(); ++i) { + const Style &info = styleInfoList.at(i).second; + if (i == styleInfoList.size() - 1) { //if last + styleId = info.StyleIdCurrent; + break; + } + unsigned int curOffset = styleInfoList.at(i).first; + unsigned int nextOffset = styleInfoList.at(i + 1).first; + if (charPos >= curOffset && charPos < nextOffset) { + styleId = info.StyleIdCurrent; + break; + } + } + return styleId; +} + +bool OleMainStream::offsetToCharPos(unsigned int offset, unsigned int &charPos, const Pieces &pieces) { + if (pieces.empty()) { + return false; + } + if ((unsigned int)pieces.front().Offset > offset) { + charPos = 0; + return true; + } + if ((unsigned int)(pieces.back().Offset + pieces.back().Length) <= offset) { + return false; + } + + std::size_t pieceNumber = 0; + for (std::size_t i = 0; i < pieces.size(); ++i) { + if (i == pieces.size() - 1) { //if last + pieceNumber = i; + break; + } + unsigned int curOffset = pieces.at(i).Offset; + unsigned int nextOffset = pieces.at(i + 1).Offset; + if (offset >= curOffset && offset < nextOffset) { + pieceNumber = i; + break; + } + } + + const Piece &piece = pieces.at(pieceNumber); + unsigned int diffOffset = offset - piece.Offset; + if (!piece.IsANSI) { + diffOffset /= 2; + } + charPos = piece.startCP + diffOffset; + return true; +} + +bool OleMainStream::readToBuffer(std::string &result, unsigned int offset, std::size_t length, OleStream &stream) { + char *buffer = new char[length]; + stream.seek(offset, true); + if (stream.read(buffer, length) != length) { + return false; + } + result = std::string(buffer, length); + delete[] buffer; + return true; +} + +unsigned int OleMainStream::calcCountOfPLC(unsigned int totalSize, unsigned int elementSize) { + //calculates count of elements in PLC structure, formula from p.30 [MS-DOC] + return (totalSize - 4) / (4 + elementSize); +} + +unsigned int OleMainStream::getPrlLength(const char *grpprlBuffer, unsigned int byteNumber) { + unsigned int tmp; + unsigned int opCode = OleUtil::getU2Bytes(grpprlBuffer, byteNumber); + switch (opCode & 0xe000) { + case 0x0000: + case 0x2000: + return 3; + case 0x4000: + case 0x8000: + case 0xA000: + return 4; + case 0xE000: + return 5; + case 0x6000: + return 6; + case 0xC000: + //counting of info length + tmp = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 2); + if (opCode == 0xc615 && tmp == 255) { + unsigned int del = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 3); + unsigned int add = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 4 + del * 4); + tmp = 2 + del * 4 + add * 3; + } + return 3 + tmp; + default: + return 1; + } +} diff --git a/fbreader/src/formats/doc/OleMainStream.h b/fbreader/src/formats/doc/OleMainStream.h new file mode 100644 index 0000000..378f037 --- /dev/null +++ b/fbreader/src/formats/doc/OleMainStream.h @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __OLEMAINSTREAM_H__ +#define __OLEMAINSTREAM_H__ + +#include <vector> +#include <string> + +#include "OleStream.h" +#include "DocFloatImageReader.h" + +class OleMainStream : public OleStream { + +public: + struct Piece { + enum PieceType { + PIECE_TEXT, + PIECE_FOOTNOTE, + PIECE_OTHER + }; + + int Offset; // TODO: maybe make it unsigned int + int Length; // TODO: maybe make it unsigned int + bool IsANSI; + PieceType Type; + unsigned int startCP; + }; + typedef std::vector<Piece> Pieces; + + struct CharInfo { + enum Font { + FONT_REGULAR = 0, + FONT_BOLD = 1 << 0, + FONT_ITALIC = 1 << 1, + FONT_UNDERLINE = 1 << 2, + FONT_CAPITALS = 1 << 3, + FONT_SMALL_CAPS = 1 << 4, + FONT_STRIKE = 1 << 5, + FONT_HIDDEN = 1 << 6, + FONT_MARKDEL = 1 << 7, + FONT_SUPERSCRIPT = 1 << 8, + FONT_SUBSCRIPT = 1 << 9 + }; + + unsigned int FontStyle; + unsigned int FontSize; + + CharInfo(); + }; + typedef std::pair<unsigned int, CharInfo> CharPosToCharInfo; + typedef std::vector<CharPosToCharInfo > CharInfoList; + + struct Style { + enum AlignmentType { + ALIGNMENT_LEFT = 0x00, + ALIGNMENT_CENTER = 0x01, + ALIGNMENT_RIGHT = 0x02, + ALIGNMENT_JUSTIFY = 0x03, + ALIGNMENT_DEFAULT // for case if alignment is not setted by word + }; + + // style Ids: + // (this is not full list of possible style ids, enum is used for using in switch-case) + enum StyleID { + STYLE_H1 = 0x1, + STYLE_H2 = 0x2, + STYLE_H3 = 0x3, + STYLE_USER = 0xFFE, + STYLE_NIL = 0xFFF, + STYLE_INVALID = 0xFFFF + }; + + unsigned int StyleIdCurrent; + unsigned int StyleIdNext; // Next style unless overruled + + bool HasPageBreakBefore; + unsigned int BeforeParagraphIndent; // Vertical indent before paragraph, pixels + unsigned int AfterParagraphIndent; // Vertical indent after paragraph, pixels + int LeftIndent; + int FirstLineIndent; + int RightIndent; + AlignmentType Alignment; + CharInfo CurrentCharInfo; + + Style(); + }; + + typedef std::pair<unsigned int, Style> CharPosToStyle; + typedef std::vector<CharPosToStyle> StyleInfoList; + typedef std::vector<Style> StyleSheet; + + struct SectionInfo { + unsigned int CharPosition; + bool IsNewPage; + + SectionInfo(); + }; + typedef std::vector<SectionInfo> SectionInfoList; + + struct Bookmark { + unsigned int CharPosition; + std::string Name; + }; + typedef std::vector<Bookmark> BookmarksList; + + struct InlineImageInfo { + unsigned int DataPosition; + + InlineImageInfo(); + }; + typedef std::pair<unsigned int, InlineImageInfo> CharPosToInlineImageInfo; + typedef std::vector<CharPosToInlineImageInfo> InlineImageInfoList; + + struct FloatImageInfo { + unsigned int ShapeId; + FloatImageInfo(); + }; + typedef std::pair<unsigned int, FloatImageInfo> CharPosToFloatImageInfo; + typedef std::vector<CharPosToFloatImageInfo> FloatImageInfoList; + + enum ImageType { //see p. 60 [MS-ODRAW] + IMAGE_EMF = 0xF01A, + IMAGE_WMF = 0xF01B, + IMAGE_PICT = 0xF01C, + IMAGE_JPEG = 0xF01D, + IMAGE_PNG = 0xF01E, + IMAGE_DIB = 0xF01F, + IMAGE_TIFF = 0xF029, + IMAGE_JPEG2 = 0xF02A + }; + +public: + OleMainStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream); + +public: + bool open(bool doReadFormattingData); + const Pieces &getPieces() const; + const CharInfoList &getCharInfoList() const; + const StyleInfoList &getStyleInfoList() const; + const BookmarksList &getBookmarks() const; + const InlineImageInfoList &getInlineImageInfoList() const; + const FloatImageInfoList &getFloatImageInfoList() const; + + ZLFileImage::Blocks getFloatImage(unsigned int shapeId) const; + ZLFileImage::Blocks getInlineImage(unsigned int dataPos) const; + +private: + bool readFIB(const char *headerBuffer); + bool readPieceTable(const char *headerBuffer, const OleEntry &tableEntry); + bool readBookmarks(const char *headerBuffer, const OleEntry &tableEntry); + bool readStylesheet(const char *headerBuffer, const OleEntry &tableEntry); + bool readSectionsInfoTable(const char *headerBuffer, const OleEntry &tableEntry); + bool readParagraphStyleTable(const char *headerBuffer, const OleEntry &tableEntry); + bool readCharInfoTable(const char *headerBuffer, const OleEntry &tableEntry); + bool readFloatingImages(const char *headerBuffer, const OleEntry &tableEntry); + +private: //readPieceTable helpers methods + static std::string getPiecesTableBuffer(const char *headerBuffer, OleStream &tableStream); + static void splitPieces(const Pieces &source, Pieces &dest1, Pieces &dest2, Piece::PieceType type1, Piece::PieceType type2, int boundary); + +private: //formatting reader helpers methods + static unsigned int getPrlLength(const char *grpprlBuffer, unsigned int byteNumber); + static void getCharInfo(unsigned int chpxOffset, unsigned int styleId, const char *grpprlBuffer, unsigned int bytes, CharInfo &charInfo); + static void getStyleInfo(unsigned int papxOffset, const char *grpprlBuffer, unsigned int bytes, Style &styleInfo); + static void getSectionInfo(const char *grpprlBuffer, std::size_t bytes, SectionInfo §ionInfo); + static bool getInlineImageInfo(unsigned int chpxOffset, const char *grpprlBuffer, unsigned int bytes, InlineImageInfo &pictureInfo); + + static Style getStyleFromStylesheet(unsigned int styleId, const StyleSheet &stylesheet); + static int getStyleIndex(unsigned int styleId, const std::vector<bool> &isFilled, const StyleSheet &stylesheet); + static unsigned int getStyleIdByCharPos(unsigned int offset, const StyleInfoList &styleInfoList); + + static bool offsetToCharPos(unsigned int offset, unsigned int &charPos, const Pieces &pieces); + static bool readToBuffer(std::string &result, unsigned int offset, std::size_t length, OleStream &stream); + + static unsigned int calcCountOfPLC(unsigned int totalSize, unsigned int elementSize); + +private: + enum PrlFlag { + UNSET = 0, + SET = 1, + UNCHANGED = 128, + NEGATION = 129 + }; + +private: + int myStartOfText; + int myEndOfText; + + Pieces myPieces; + + StyleSheet myStyleSheet; + + CharInfoList myCharInfoList; + StyleInfoList myStyleInfoList; + SectionInfoList mySectionInfoList; + InlineImageInfoList myInlineImageInfoList; + FloatImageInfoList myFloatImageInfoList; + + BookmarksList myBookmarks; + + shared_ptr<OleStream> myDataStream; + + shared_ptr<DocFloatImageReader> myFLoatImageReader; +}; + +#endif /* __OLEMAINSTREAM_H__ */ diff --git a/fbreader/src/formats/doc/OleStorage.cpp b/fbreader/src/formats/doc/OleStorage.cpp new file mode 100644 index 0000000..a7ab81a --- /dev/null +++ b/fbreader/src/formats/doc/OleStorage.cpp @@ -0,0 +1,304 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLLogger.h> + +#include "OleStorage.h" +#include "OleUtil.h" + +#include <cstring> + +const std::size_t OleStorage::BBD_BLOCK_SIZE = 512; + +OleStorage::OleStorage() { + clear(); +} + +void OleStorage::clear() { + myInputStream = 0; + mySectorSize = 0; + myShortSectorSize = 0; + myStreamSize = 0; + myRootEntryIndex = -1; + + myDIFAT.clear(); + myBBD.clear(); + mySBD.clear(); + myProperties.clear(); + myEntries.clear(); +} + + + +bool OleStorage::init(shared_ptr<ZLInputStream> stream, std::size_t streamSize) { + clear(); + + myInputStream = stream; + myStreamSize = streamSize; + myInputStream->seek(0, true); + + char oleBuf[BBD_BLOCK_SIZE]; + std::size_t ret = myInputStream->read(oleBuf, BBD_BLOCK_SIZE); + if (ret != BBD_BLOCK_SIZE) { + clear(); + return false; + } + static const char OLE_SIGN[] = {(char)0xD0, (char)0xCF, (char)0x11, (char)0xE0, (char)0xA1, (char)0xB1, (char)0x1A, (char)0xE1, 0}; + if (std::strncmp(oleBuf, OLE_SIGN, 8) != 0) { + clear(); + return false; + } + mySectorSize = 1 << OleUtil::getU2Bytes(oleBuf, 0x1e); //offset for value of big sector size + myShortSectorSize = 1 << OleUtil::getU2Bytes(oleBuf, 0x20); //offset for value of small sector size + + if (readDIFAT(oleBuf) && readBBD(oleBuf) && readSBD(oleBuf) && readProperties(oleBuf) && readAllEntries()) { + return true; + } + clear(); + return false; +} + +bool OleStorage::readDIFAT(char *oleBuf) { + int difatBlock = OleUtil::get4Bytes(oleBuf, 0x44); //address for first difat sector + int difatSectorNumbers = OleUtil::get4Bytes(oleBuf, 0x48); //numbers of additional difat records + + //436 of difat records are stored in header, by offset 0x4c + for (unsigned int i = 0; i < 436; i += 4) { + myDIFAT.push_back(OleUtil::get4Bytes(oleBuf + 0x4c, i)); + } + + //for files > 6.78 mb we need read additional DIFAT fields + for (int i = 0; difatBlock > 0 && i < difatSectorNumbers; ++i) { + ZLLogger::Instance().println("DocPlugin", "Read additional data for DIFAT"); + char buffer[mySectorSize]; + myInputStream->seek(BBD_BLOCK_SIZE + difatBlock * mySectorSize, true); + if (myInputStream->read(buffer, mySectorSize) != mySectorSize) { + ZLLogger::Instance().println("DocPlugin", "Error read DIFAT!"); + return false; + } + for (unsigned int j = 0; j < (mySectorSize - 4); j += 4) { + myDIFAT.push_back(OleUtil::get4Bytes(buffer, j)); + } + difatBlock = OleUtil::get4Bytes(buffer, mySectorSize - 4); //next DIFAT block is pointed at the end of the sector + } + + //removing unusable DIFAT links + //0xFFFFFFFF means "free section" + while (!myDIFAT.empty() && myDIFAT.back() == (int)0xFFFFFFFF) { + myDIFAT.pop_back(); + } + return true; +} + +bool OleStorage::readBBD(char *oleBuf) { + char buffer[mySectorSize]; + unsigned int bbdNumberBlocks = OleUtil::getU4Bytes(oleBuf, 0x2c); //number of big blocks + + if (myDIFAT.size() < bbdNumberBlocks) { + //TODO maybe add check on myDIFAT == bbdNumberBlocks + ZLLogger::Instance().println("DocPlugin", "Wrong number of FAT blocks value"); + return false; + } + + for (unsigned int i = 0; i < bbdNumberBlocks; ++i) { + int bbdSector = myDIFAT.at(i); + if (bbdSector >= (int)(myStreamSize / mySectorSize) || bbdSector < 0) { + ZLLogger::Instance().println("DocPlugin", "Bad BBD entry!"); + return false; + } + myInputStream->seek(BBD_BLOCK_SIZE + bbdSector * mySectorSize, true); + if (myInputStream->read(buffer, mySectorSize) != mySectorSize) { + ZLLogger::Instance().println("DocPlugin", "Error during reading BBD!"); + return false; + } + for (unsigned int j = 0; j < mySectorSize; j += 4) { + myBBD.push_back(OleUtil::get4Bytes(buffer, j)); + } + } + return true; +} + +bool OleStorage::readSBD(char *oleBuf) { + int sbdCur = OleUtil::get4Bytes(oleBuf, 0x3c); //address of first small sector + int sbdCount = OleUtil::get4Bytes(oleBuf, 0x40); //count of small sectors + + if (sbdCur <= 0) { + ZLLogger::Instance().println("DocPlugin", "There's no SBD, don't read it"); + return true; + } + + char buffer[mySectorSize]; + for (int i = 0; i < sbdCount; ++i) { + if (i != 0) { + if (sbdCur < 0 || (unsigned int)sbdCur >= myBBD.size()) { + ZLLogger::Instance().println("DocPlugin", "error during parsing SBD"); + return false; + } + sbdCur = myBBD.at(sbdCur); + } + if (sbdCur <= 0) { + break; + } + myInputStream->seek(BBD_BLOCK_SIZE + sbdCur * mySectorSize, true); + if (myInputStream->read(buffer, mySectorSize) != mySectorSize) { + ZLLogger::Instance().println("DocPlugin", "reading error during parsing SBD"); + return false; + } + for (unsigned int j = 0; j < mySectorSize; j += 4) { + mySBD.push_back(OleUtil::get4Bytes(buffer, j)); + } + + } + return true; +} + +bool OleStorage::readProperties(char *oleBuf) { + int propCur = OleUtil::get4Bytes(oleBuf, 0x30); //offset for address of sector with first property + if (propCur < 0) { + ZLLogger::Instance().println("DocPlugin", "Wrong first directory sector location"); + return false; + } + + char buffer[mySectorSize]; + do { + myInputStream->seek(BBD_BLOCK_SIZE + propCur * mySectorSize, true); + if (myInputStream->read(buffer, mySectorSize) != mySectorSize) { + ZLLogger::Instance().println("DocPlugin", "Error during reading properties"); + return false; + } + for (unsigned int j = 0; j < mySectorSize; j += 128) { + myProperties.push_back(std::string(buffer + j, 128)); + } + if (propCur < 0 || (std::size_t)propCur >= myBBD.size()) { + break; + } + propCur = myBBD.at(propCur); + } while (propCur >= 0 && propCur < (int)(myStreamSize / mySectorSize)); + return true; +} + +bool OleStorage::readAllEntries() { + int propCount = myProperties.size(); + for (int i = 0; i < propCount; ++i) { + OleEntry entry; + bool result = readOleEntry(i, entry); + if (!result) { + break; + } + if (entry.type == OleEntry::ROOT_DIR) { + myRootEntryIndex = i; + } + myEntries.push_back(entry); + } + if (myRootEntryIndex < 0) { + return false; + } + return true; +} + +bool OleStorage::readOleEntry(int propNumber, OleEntry &e) { + static const std::string ROOT_ENTRY = "Root Entry"; + + std::string property = myProperties.at(propNumber); + + char oleType = property.at(0x42); //offset for Ole Type + if (oleType != 1 && oleType != 2 && oleType != 3 && oleType != 5) { + ZLLogger::Instance().println("DocPlugin", "entry -- not right ole type"); + return false; + } + + e.type = (OleEntry::Type)oleType; + + int nameLength = OleUtil::getU2Bytes(property.c_str(), 0x40); //offset for value entry's name length + e.name.clear(); + e.name.reserve(33); //max size of entry name + + if ((unsigned int)nameLength >= property.size()) { + return false; + } + for (int i = 0; i < nameLength; i+=2) { + char c = property.at(i); + if (c != 0) { + e.name += c; + } + } + + e.length = OleUtil::getU4Bytes(property.c_str(), 0x78); //offset for entry's length value + e.isBigBlock = e.length >= 0x1000 || e.name == ROOT_ENTRY; + + // Read sector chain + if (property.size() < 0x74 + 4) { + ZLLogger::Instance().println("DocPlugin", "problems with reading ole entry"); + return false; + } + int chainCur = OleUtil::get4Bytes(property.c_str(), 0x74); //offset for start block of entry + if (chainCur >= 0 && (chainCur <= (int)(myStreamSize / (e.isBigBlock ? mySectorSize : myShortSectorSize)))) { + //filling blocks with chains + do { + e.blocks.push_back((unsigned int)chainCur); + if (e.isBigBlock && (std::size_t)chainCur < myBBD.size()) { + chainCur = myBBD.at(chainCur); + } else if (!mySBD.empty() && (std::size_t)chainCur < mySBD.size()) { + chainCur = mySBD.at(chainCur); + } else { + chainCur = -1; + } + } while (chainCur > 0 && + chainCur < (int)(e.isBigBlock ? myBBD.size() : mySBD.size()) && + e.blocks.size() <= e.length / (e.isBigBlock ? mySectorSize : myShortSectorSize)); + } + e.length = std::min(e.length, (unsigned int)((e.isBigBlock ? mySectorSize : myShortSectorSize) * e.blocks.size())); + return true; +} + +bool OleStorage::countFileOffsetOfBlock(const OleEntry &e, unsigned int blockNumber, unsigned int &result) const { + //TODO maybe better syntax can be used? + if (e.blocks.size() <= (std::size_t)blockNumber) { + ZLLogger::Instance().println("DocPlugin", "countFileOffsetOfBlock can't be done, blockNumber is invalid"); + return false; + } + if (e.isBigBlock) { + result = BBD_BLOCK_SIZE + e.blocks.at(blockNumber) * mySectorSize; + } else { + unsigned int sbdPerSector = mySectorSize / myShortSectorSize; + unsigned int sbdSectorNumber = e.blocks.at(blockNumber) / sbdPerSector; + unsigned int sbdSectorMod = e.blocks.at(blockNumber) % sbdPerSector; + if (myEntries.at(myRootEntryIndex).blocks.size() <= (std::size_t)sbdSectorNumber) { + ZLLogger::Instance().println("DocPlugin", "countFileOffsetOfBlock can't be done, invalid sbd data"); + return false; + } + result = BBD_BLOCK_SIZE + myEntries.at(myRootEntryIndex).blocks.at(sbdSectorNumber) * mySectorSize + sbdSectorMod * myShortSectorSize; + } + return true; +} + +bool OleStorage::getEntryByName(std::string name, OleEntry &returnEntry) const { + //TODO fix the workaround for duplicates streams: now it takes a stream with max length + unsigned int maxLength = 0; + for (std::size_t i = 0; i < myEntries.size(); ++i) { + const OleEntry &entry = myEntries.at(i); + if (entry.name == name && entry.length >= maxLength) { + returnEntry = entry; + maxLength = entry.length; + } + } + return maxLength > 0; +} + + diff --git a/fbreader/src/formats/doc/OleStorage.h b/fbreader/src/formats/doc/OleStorage.h new file mode 100644 index 0000000..584ee94 --- /dev/null +++ b/fbreader/src/formats/doc/OleStorage.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __OLESTORAGE_H__ +#define __OLESTORAGE_H__ + +#include <algorithm> +#include <vector> +#include <string> + +#include <ZLInputStream.h> + +struct OleEntry { + enum Type { + DIR = 1, + STREAM = 2, + ROOT_DIR = 5, + LOCK_BYTES =3 + }; + + typedef std::vector<unsigned int> Blocks; + + std::string name; + unsigned int length; + Type type; + Blocks blocks; + bool isBigBlock; +}; + +class OleStorage { + +public: + static const std::size_t BBD_BLOCK_SIZE; + +public: + OleStorage(); + bool init(shared_ptr<ZLInputStream>, std::size_t streamSize); + void clear(); + const std::vector<OleEntry> &getEntries() const; + bool getEntryByName(std::string name, OleEntry &entry) const; + + unsigned int getSectorSize() const; + unsigned int getShortSectorSize() const; + +public: //TODO make private + bool countFileOffsetOfBlock(const OleEntry &e, unsigned int blockNumber, unsigned int &result) const; + +private: + bool readDIFAT(char *oleBuf); + bool readBBD(char *oleBuf); + bool readSBD(char *oleBuf); + bool readProperties(char *oleBuf); + + bool readAllEntries(); + bool readOleEntry(int propNumber, OleEntry &entry); + +private: + + shared_ptr<ZLInputStream> myInputStream; + unsigned int mySectorSize, myShortSectorSize; + + std::size_t myStreamSize; + std::vector<int> myDIFAT; //double-indirect file allocation table + std::vector<int> myBBD; //Big Block Depot + std::vector<int> mySBD; //Small Block Depot + std::vector<std::string> myProperties; + std::vector<OleEntry> myEntries; + int myRootEntryIndex; + +}; + +inline const std::vector<OleEntry> &OleStorage::getEntries() const { return myEntries; } +inline unsigned int OleStorage::getSectorSize() const { return mySectorSize; } +inline unsigned int OleStorage::getShortSectorSize() const { return myShortSectorSize; } + +#endif /* __OLESTORAGE_H__ */ diff --git a/fbreader/src/formats/doc/OleStream.cpp b/fbreader/src/formats/doc/OleStream.cpp new file mode 100644 index 0000000..8de1cc4 --- /dev/null +++ b/fbreader/src/formats/doc/OleStream.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLLogger.h> + +#include "OleStream.h" +#include "OleUtil.h" + +OleStream::OleStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream) : + myStorage(storage), + myOleEntry(oleEntry), + myBaseStream(stream) { + myOleOffset = 0; +} + + +bool OleStream::open() { + if (myOleEntry.type != OleEntry::STREAM) { + return false; + } + return true; +} + +std::size_t OleStream::read(char *buffer, std::size_t maxSize) { + std::size_t length = maxSize; + std::size_t readedBytes = 0; + std::size_t bytesLeftInCurBlock; + unsigned int newFileOffset; + + unsigned int curBlockNumber, modBlock; + std::size_t toReadBlocks, toReadBytes; + + if (myOleOffset + length > myOleEntry.length) { + length = myOleEntry.length - myOleOffset; + } + + std::size_t sectorSize = (std::size_t)(myOleEntry.isBigBlock ? myStorage->getSectorSize() : myStorage->getShortSectorSize()); + + curBlockNumber = myOleOffset / sectorSize; + if (curBlockNumber >= myOleEntry.blocks.size()) { + return 0; + } + modBlock = myOleOffset % sectorSize; + bytesLeftInCurBlock = sectorSize - modBlock; + if (bytesLeftInCurBlock < length) { + toReadBlocks = (length - bytesLeftInCurBlock) / sectorSize; + toReadBytes = (length - bytesLeftInCurBlock) % sectorSize; + } else { + toReadBlocks = toReadBytes = 0; + } + + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return 0; + } + newFileOffset += modBlock; + + myBaseStream->seek(newFileOffset, true); + + readedBytes = myBaseStream->read(buffer, std::min(length, bytesLeftInCurBlock)); + for (std::size_t i = 0; i < toReadBlocks; ++i) { + if (++curBlockNumber >= myOleEntry.blocks.size()) { + break; + } + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return readedBytes; + } + myBaseStream->seek(newFileOffset, true); + readedBytes += myBaseStream->read(buffer + readedBytes, std::min(length - readedBytes, sectorSize)); + } + if (toReadBytes > 0 && ++curBlockNumber < myOleEntry.blocks.size()) { + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return readedBytes; + } + myBaseStream->seek(newFileOffset, true); + readedBytes += myBaseStream->read(buffer + readedBytes, toReadBytes); + } + myOleOffset += readedBytes; + return readedBytes; +} + +bool OleStream::eof() const { + return (myOleOffset >= myOleEntry.length); +} + + +void OleStream::close() { +} + +bool OleStream::seek(unsigned int offset, bool absoluteOffset) { + unsigned int newOleOffset = 0; + unsigned int newFileOffset; + + if (absoluteOffset) { + newOleOffset = offset; + } else { + newOleOffset = myOleOffset + offset; + } + + newOleOffset = std::min(newOleOffset, myOleEntry.length); + + unsigned int sectorSize = (myOleEntry.isBigBlock ? myStorage->getSectorSize() : myStorage->getShortSectorSize()); + unsigned int blockNumber = newOleOffset / sectorSize; + if (blockNumber >= myOleEntry.blocks.size()) { + return false; + } + + unsigned int modBlock = newOleOffset % sectorSize; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, blockNumber, newFileOffset)) { + return false; + } + newFileOffset += modBlock; + myBaseStream->seek(newFileOffset, true); + myOleOffset = newOleOffset; + return true; +} + +std::size_t OleStream::offset() { + return myOleOffset; +} + +ZLFileImage::Blocks OleStream::getBlockPieceInfoList(unsigned int offset, unsigned int size) const { + ZLFileImage::Blocks list; + unsigned int sectorSize = (myOleEntry.isBigBlock ? myStorage->getSectorSize() : myStorage->getShortSectorSize()); + unsigned int curBlockNumber = offset / sectorSize; + if (curBlockNumber >= myOleEntry.blocks.size()) { + return list; + } + unsigned int modBlock = offset % sectorSize; + unsigned int startFileOffset = 0; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, startFileOffset)) { + return ZLFileImage::Blocks(); + } + startFileOffset += modBlock; + + unsigned int bytesLeftInCurBlock = sectorSize - modBlock; + unsigned int toReadBlocks = 0, toReadBytes = 0; + if (bytesLeftInCurBlock < size) { + toReadBlocks = (size - bytesLeftInCurBlock) / sectorSize; + toReadBytes = (size - bytesLeftInCurBlock) % sectorSize; + } + + unsigned int readedBytes = std::min(size, bytesLeftInCurBlock); + list.push_back(ZLFileImage::Block(startFileOffset, readedBytes)); + + for (unsigned int i = 0; i < toReadBlocks; ++i) { + if (++curBlockNumber >= myOleEntry.blocks.size()) { + break; + } + unsigned int newFileOffset = 0; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return ZLFileImage::Blocks(); + } + unsigned int readbytes = std::min(size - readedBytes, sectorSize); + list.push_back(ZLFileImage::Block(newFileOffset, readbytes)); + readedBytes += readbytes; + } + if (toReadBytes > 0 && ++curBlockNumber < myOleEntry.blocks.size()) { + unsigned int newFileOffset = 0; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return ZLFileImage::Blocks(); + } + unsigned int readbytes = toReadBytes; + list.push_back(ZLFileImage::Block(newFileOffset, readbytes)); + readedBytes += readbytes; + } + + return concatBlocks(list); +} + +ZLFileImage::Blocks OleStream::concatBlocks(const ZLFileImage::Blocks &blocks) { + if (blocks.size() < 2) { + return blocks; + } + ZLFileImage::Blocks optList; + ZLFileImage::Block curBlock = blocks.at(0); + unsigned int nextOffset = curBlock.offset + curBlock.size; + for (std::size_t i = 1; i < blocks.size(); ++i) { + ZLFileImage::Block b = blocks.at(i); + if (b.offset == nextOffset) { + curBlock.size += b.size; + nextOffset += b.size; + } else { + optList.push_back(curBlock); + curBlock = b; + nextOffset = curBlock.offset + curBlock.size; + } + } + optList.push_back(curBlock); + return optList; +} + +std::size_t OleStream::fileOffset() { + //TODO maybe remove this method, it doesn't use at this time + std::size_t sectorSize = (std::size_t)(myOleEntry.isBigBlock ? myStorage->getSectorSize() : myStorage->getShortSectorSize()); + unsigned int curBlockNumber = myOleOffset / sectorSize; + if (curBlockNumber >= myOleEntry.blocks.size()) { + return 0; + } + unsigned int modBlock = myOleOffset % sectorSize; + unsigned int curOffset = 0; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, curOffset)) { + return 0; //TODO maybe remove -1? + } + return curOffset + modBlock; +} diff --git a/fbreader/src/formats/doc/OleStream.h b/fbreader/src/formats/doc/OleStream.h new file mode 100644 index 0000000..861c7cb --- /dev/null +++ b/fbreader/src/formats/doc/OleStream.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __OLESTREAM_H__ +#define __OLESTREAM_H__ + +#include <ZLFileImage.h> + +#include "OleStorage.h" + +class OleStream { + +public: + OleStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream); + +public: + bool open(); + std::size_t read(char *buffer, std::size_t maxSize); + void close(); + +public: + bool seek(unsigned int offset, bool absoluteOffset); + std::size_t offset(); + +public: + ZLFileImage::Blocks getBlockPieceInfoList(unsigned int offset, unsigned int size) const; + static ZLFileImage::Blocks concatBlocks(const ZLFileImage::Blocks &blocks); + std::size_t fileOffset(); + +public: + bool eof() const; + +protected: + shared_ptr<OleStorage> myStorage; + + OleEntry myOleEntry; + shared_ptr<ZLInputStream> myBaseStream; + + unsigned int myOleOffset; +}; + +#endif /* __OLESTREAM_H__ */ diff --git a/fbreader/src/formats/doc/OleStreamParser.cpp b/fbreader/src/formats/doc/OleStreamParser.cpp new file mode 100644 index 0000000..0a9c62d --- /dev/null +++ b/fbreader/src/formats/doc/OleStreamParser.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +//#include <cctype> +//#include <cstring> + +#include <ZLLogger.h> + +#include "OleMainStream.h" +#include "OleUtil.h" +#include "OleStreamParser.h" + +//word's control chars: +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_FOOTNOTE_MARK = 0x0002; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_TABLE_SEPARATOR = 0x0007; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_HORIZONTAL_TAB = 0x0009; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_HARD_LINEBREAK = 0x000b; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_PAGE_BREAK = 0x000c; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_END_OF_PARAGRAPH = 0x000d; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_MINUS = 0x001e; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_SOFT_HYPHEN = 0x001f; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_START_FIELD = 0x0013; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_SEPARATOR_FIELD = 0x0014; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_END_FIELD = 0x0015; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_ZERO_WIDTH_UNBREAKABLE_SPACE = 0xfeff; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::INLINE_IMAGE = 0x0001; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::FLOAT_IMAGE = 0x0008; + +//unicode values: +const ZLUnicodeUtil::Ucs2Char OleStreamParser::NULL_SYMBOL = 0x0; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::FILE_SEPARATOR = 0x1c; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::LINE_FEED = 0x000a; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::SOFT_HYPHEN = 0xad; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::SPACE = 0x20; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::MINUS = 0x2D; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::VERTICAL_LINE = 0x7C; + +OleStreamParser::OleStreamParser() { + myCurBufferPosition = 0; + + myCurCharPos = 0; + myNextStyleInfoIndex = 0; + myNextCharInfoIndex = 0; + myNextBookmarkIndex = 0; + myNextInlineImageInfoIndex = 0; + myNextFloatImageInfoIndex = 0; +} + +bool OleStreamParser::readStream(OleMainStream &oleMainStream) { + ZLUnicodeUtil::Ucs2Char ucs2char; + bool tabMode = false; + while (getUcs2Char(oleMainStream, ucs2char)) { + if (tabMode) { + tabMode = false; + if (ucs2char == WORD_TABLE_SEPARATOR) { + handleTableEndRow(); + continue; + } else { + handleTableSeparator(); + } + } + + if (ucs2char < 32) { + switch (ucs2char) { + case NULL_SYMBOL: + break; + case WORD_HARD_LINEBREAK: + handleHardLinebreak(); + break; + case WORD_END_OF_PARAGRAPH: + case WORD_PAGE_BREAK: + handleParagraphEnd(); + break; + case WORD_TABLE_SEPARATOR: + tabMode = true; + break; + case WORD_FOOTNOTE_MARK: + handleFootNoteMark(); + break; + case WORD_START_FIELD: + handleStartField(); + break; + case WORD_SEPARATOR_FIELD: + handleSeparatorField(); + break; + case WORD_END_FIELD: + handleEndField(); + break; + case INLINE_IMAGE: + case FLOAT_IMAGE: + break; + default: + handleOtherControlChar(ucs2char); + break; + } + } else if (ucs2char == WORD_ZERO_WIDTH_UNBREAKABLE_SPACE) { + continue; //skip + } else { + handleChar(ucs2char); + } + } + + return true; +} + +bool OleStreamParser::getUcs2Char(OleMainStream &stream, ZLUnicodeUtil::Ucs2Char &ucs2char) { + while (myCurBufferPosition >= myBuffer.size()) { + myBuffer.clear(); + myCurBufferPosition = 0; + if (!readNextPiece(stream)) { + return false; + } + } + ucs2char = myBuffer.at(myCurBufferPosition++); + processStyles(stream); + + switch (ucs2char) { + case INLINE_IMAGE: + processInlineImage(stream); + break; + case FLOAT_IMAGE: + processFloatImage(stream); + break; + } + ++myCurCharPos; + return true; +} + +void OleStreamParser::processInlineImage(OleMainStream &stream) { + const OleMainStream::InlineImageInfoList &imageInfoList = stream.getInlineImageInfoList(); + if (imageInfoList.empty()) { + return; + } + //seek to curCharPos, because not all entries are real pictures + while(myNextInlineImageInfoIndex < imageInfoList.size() && imageInfoList.at(myNextInlineImageInfoIndex).first < myCurCharPos) { + ++myNextInlineImageInfoIndex; + } + while (myNextInlineImageInfoIndex < imageInfoList.size() && imageInfoList.at(myNextInlineImageInfoIndex).first == myCurCharPos) { + OleMainStream::InlineImageInfo info = imageInfoList.at(myNextInlineImageInfoIndex).second; + ZLFileImage::Blocks list = stream.getInlineImage(info.DataPosition); + if (!list.empty()) { + handleImage(list); + } + ++myNextInlineImageInfoIndex; + } +} + +void OleStreamParser::processFloatImage(OleMainStream &stream) { + const OleMainStream::FloatImageInfoList &imageInfoList = stream.getFloatImageInfoList(); + if (imageInfoList.empty()) { + return; + } + //seek to curCharPos, because not all entries are real pictures + while(myNextFloatImageInfoIndex < imageInfoList.size() && imageInfoList.at(myNextFloatImageInfoIndex).first < myCurCharPos) { + ++myNextFloatImageInfoIndex; + } + while (myNextFloatImageInfoIndex < imageInfoList.size() && imageInfoList.at(myNextFloatImageInfoIndex).first == myCurCharPos) { + OleMainStream::FloatImageInfo info = imageInfoList.at(myNextFloatImageInfoIndex).second; + ZLFileImage::Blocks list = stream.getFloatImage(info.ShapeId); + if (!list.empty()) { + handleImage(list); + } + ++myNextFloatImageInfoIndex; + } +} + +void OleStreamParser::processStyles(OleMainStream &stream) { + const OleMainStream::StyleInfoList &styleInfoList = stream.getStyleInfoList(); + if (!styleInfoList.empty()) { + while (myNextStyleInfoIndex < styleInfoList.size() && styleInfoList.at(myNextStyleInfoIndex).first == myCurCharPos) { + OleMainStream::Style info = styleInfoList.at(myNextStyleInfoIndex).second; + handleParagraphStyle(info); + ++myNextStyleInfoIndex; + } + } + + const OleMainStream::CharInfoList &charInfoList = stream.getCharInfoList(); + if (!charInfoList.empty()) { + while (myNextCharInfoIndex < charInfoList.size() && charInfoList.at(myNextCharInfoIndex).first == myCurCharPos) { + OleMainStream::CharInfo info = charInfoList.at(myNextCharInfoIndex).second; + handleFontStyle(info.FontStyle); + ++myNextCharInfoIndex; + } + } + + const OleMainStream::BookmarksList &bookmarksList = stream.getBookmarks(); + if (!bookmarksList.empty()) { + while (myNextBookmarkIndex < bookmarksList.size() && bookmarksList.at(myNextBookmarkIndex).CharPosition == myCurCharPos) { + OleMainStream::Bookmark bookmark = bookmarksList.at(myNextBookmarkIndex); + handleBookmark(bookmark.Name); + ++myNextBookmarkIndex; + } + } +} diff --git a/fbreader/src/formats/doc/OleStreamParser.h b/fbreader/src/formats/doc/OleStreamParser.h new file mode 100644 index 0000000..1adec2f --- /dev/null +++ b/fbreader/src/formats/doc/OleStreamParser.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __OLESTREAMPARSER_H__ +#define __OLESTREAMPARSER_H__ + +#include <ZLUnicodeUtil.h> + +#include "OleMainStream.h" +#include "OleStreamReader.h" + +class OleStreamParser : public OleStreamReader { + +public: + //word's control chars: + static const ZLUnicodeUtil::Ucs2Char WORD_FOOTNOTE_MARK; + static const ZLUnicodeUtil::Ucs2Char WORD_TABLE_SEPARATOR; + static const ZLUnicodeUtil::Ucs2Char WORD_HORIZONTAL_TAB; + static const ZLUnicodeUtil::Ucs2Char WORD_HARD_LINEBREAK; + static const ZLUnicodeUtil::Ucs2Char WORD_PAGE_BREAK; + static const ZLUnicodeUtil::Ucs2Char WORD_END_OF_PARAGRAPH; + static const ZLUnicodeUtil::Ucs2Char WORD_MINUS; + static const ZLUnicodeUtil::Ucs2Char WORD_SOFT_HYPHEN; + static const ZLUnicodeUtil::Ucs2Char WORD_START_FIELD; + static const ZLUnicodeUtil::Ucs2Char WORD_SEPARATOR_FIELD; + static const ZLUnicodeUtil::Ucs2Char WORD_END_FIELD; + static const ZLUnicodeUtil::Ucs2Char WORD_ZERO_WIDTH_UNBREAKABLE_SPACE; + static const ZLUnicodeUtil::Ucs2Char INLINE_IMAGE; + static const ZLUnicodeUtil::Ucs2Char FLOAT_IMAGE; + + //unicode values: + static const ZLUnicodeUtil::Ucs2Char NULL_SYMBOL; + static const ZLUnicodeUtil::Ucs2Char FILE_SEPARATOR; + static const ZLUnicodeUtil::Ucs2Char LINE_FEED; + static const ZLUnicodeUtil::Ucs2Char SOFT_HYPHEN; + static const ZLUnicodeUtil::Ucs2Char SPACE; + static const ZLUnicodeUtil::Ucs2Char MINUS; + static const ZLUnicodeUtil::Ucs2Char VERTICAL_LINE; + +public: + OleStreamParser(); + +private: + bool readStream(OleMainStream &stream); + +protected: + virtual void handleChar(ZLUnicodeUtil::Ucs2Char ucs2char) = 0; + virtual void handleHardLinebreak() = 0; + virtual void handleParagraphEnd() = 0; + virtual void handlePageBreak() = 0; + virtual void handleTableSeparator() = 0; + virtual void handleTableEndRow() = 0; + virtual void handleFootNoteMark() = 0; + virtual void handleStartField() = 0; + virtual void handleSeparatorField() = 0; + virtual void handleEndField() = 0; + virtual void handleImage(const ZLFileImage::Blocks &blocks) = 0; + virtual void handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char) = 0; + + virtual void handleFontStyle(unsigned int fontStyle) = 0; + virtual void handleParagraphStyle(const OleMainStream::Style &styleInfo) = 0; + virtual void handleBookmark(const std::string &name) = 0; + +private: + bool getUcs2Char(OleMainStream &stream, ZLUnicodeUtil::Ucs2Char &ucs2char); + void processInlineImage(OleMainStream &stream); + void processFloatImage(OleMainStream &stream); + void processStyles(OleMainStream &stream); + +private: +protected: + ZLUnicodeUtil::Ucs2String myBuffer; +private: + std::size_t myCurBufferPosition; + + unsigned int myCurCharPos; + + std::size_t myNextStyleInfoIndex; + std::size_t myNextCharInfoIndex; + std::size_t myNextBookmarkIndex; + std::size_t myNextInlineImageInfoIndex; + std::size_t myNextFloatImageInfoIndex; +}; + +#endif /* __OLESTREAMPARSER_H__ */ diff --git a/fbreader/src/formats/doc/OleStreamReader.cpp b/fbreader/src/formats/doc/OleStreamReader.cpp new file mode 100644 index 0000000..224489a --- /dev/null +++ b/fbreader/src/formats/doc/OleStreamReader.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLLogger.h> + +#include "OleMainStream.h" +#include "OleUtil.h" +#include "OleStreamReader.h" + +OleStreamReader::OleStreamReader() : myNextPieceNumber(0) { +} + +bool OleStreamReader::readDocument(shared_ptr<ZLInputStream> inputStream, bool doReadFormattingData) { + static const std::string WORD_DOCUMENT = "WordDocument"; + + shared_ptr<OleStorage> storage = new OleStorage; + + if (!storage->init(inputStream, inputStream->sizeOfOpened())) { + ZLLogger::Instance().println("DocPlugin", "Broken OLE file"); + return false; + } + + OleEntry wordDocumentEntry; + if (!storage->getEntryByName(WORD_DOCUMENT, wordDocumentEntry)) { + return false; + } + + OleMainStream oleStream(storage, wordDocumentEntry, inputStream); + if (!oleStream.open(doReadFormattingData)) { + ZLLogger::Instance().println("DocPlugin", "Cannot open OleMainStream"); + return false; + } + return readStream(oleStream); +} + +bool OleStreamReader::readNextPiece(OleMainStream &stream) { + const OleMainStream::Pieces &pieces = stream.getPieces(); + if (myNextPieceNumber >= pieces.size()) { + return false; + } + const OleMainStream::Piece &piece = pieces.at(myNextPieceNumber); + + if (piece.Type == OleMainStream::Piece::PIECE_FOOTNOTE) { + footnotesStartHandler(); + } else if (piece.Type == OleMainStream::Piece::PIECE_OTHER) { + return false; + } + + if (!stream.seek(piece.Offset, true)) { + //TODO maybe in that case we should take next piece? + return false; + } + char *textBuffer = new char[piece.Length]; + std::size_t readBytes = stream.read(textBuffer, piece.Length); + if (readBytes != (std::size_t)piece.Length) { + ZLLogger::Instance().println("DocPlugin", "not all bytes have been read from piece"); + } + + if (!piece.IsANSI) { + for (std::size_t i = 0; i < readBytes; i += 2) { + ucs2SymbolHandler(OleUtil::getU2Bytes(textBuffer, i)); + } + } else { + ansiDataHandler(textBuffer, readBytes); + } + ++myNextPieceNumber; + delete[] textBuffer; + + return true; +} diff --git a/fbreader/src/formats/doc/OleStreamReader.h b/fbreader/src/formats/doc/OleStreamReader.h new file mode 100644 index 0000000..2d2a0ae --- /dev/null +++ b/fbreader/src/formats/doc/OleStreamReader.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __OLESTREAMREADER_H__ +#define __OLESTREAMREADER_H__ + +#include <ZLUnicodeUtil.h> + +#include "OleMainStream.h" + +class OleStreamReader { + +public: + OleStreamReader(); + bool readDocument(shared_ptr<ZLInputStream> stream, bool doReadFormattingData); + +protected: + virtual bool readStream(OleMainStream &stream) = 0; + + bool readNextPiece(OleMainStream &stream); + + virtual void ansiDataHandler(const char *buffer, std::size_t len) = 0; + virtual void ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol) = 0; + virtual void footnotesStartHandler() = 0; + +private: + std::size_t myNextPieceNumber; +}; + +#endif /* __OLESTREAMREADER_H__ */ diff --git a/fbreader/src/formats/doc/OleUtil.cpp b/fbreader/src/formats/doc/OleUtil.cpp new file mode 100644 index 0000000..2e8f685 --- /dev/null +++ b/fbreader/src/formats/doc/OleUtil.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include "OleUtil.h" + +int OleUtil::get4Bytes(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return + (int)buf[offset] + | ((int)buf[offset+1] << 8) + | ((int)buf[offset+2] << 16) + | ((int)buf[offset+3] << 24); +} + +unsigned int OleUtil::getU4Bytes(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return + (unsigned int)buf[offset] + | ((unsigned int)buf[offset+1] << 8) + | ((unsigned int)buf[offset+2] << 16) + | ((unsigned int)buf[offset+3] << 24); +} + +unsigned int OleUtil::getU2Bytes(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return + (unsigned int)buf[offset] + | ((unsigned int)buf[offset+1] << 8); +} + +unsigned int OleUtil::getU1Byte(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return (unsigned int)buf[offset]; +} + +int OleUtil::get1Byte(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return (int)buf[offset]; +} + + + diff --git a/fbreader/src/formats/doc/OleUtil.h b/fbreader/src/formats/doc/OleUtil.h new file mode 100644 index 0000000..531c769 --- /dev/null +++ b/fbreader/src/formats/doc/OleUtil.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __OLEUTIL_H__ +#define __OLEUTIL_H__ + +class OleUtil { +public: + static int get4Bytes(const char *buffer, unsigned int offset); + static unsigned int getU4Bytes(const char *buffer, unsigned int offset); + static unsigned int getU2Bytes(const char *buffer, unsigned int offset); + static unsigned int getU1Byte(const char *buffer, unsigned int offset); + static int get1Byte(const char *buffer, unsigned int offset); +}; + +#endif /* __OLEUTIL_H__ */ |