7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
16 * @package Zend_Search_Lucene
17 * @subpackage Document
18 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
20 * @version $Id: Xlsx.php 16971 2009-07-22 18:05:45Z mikaelkael $
24 /** Zend_Search_Lucene_Document_OpenXml */
25 require_once 'Zend/Search/Lucene/Document/OpenXml.php';
if (class_exists('ZipArchive', false)) {

/**
 * Lucene document built from an Office Open XML spreadsheet (.xlsx).
 *
 * The text of every worksheet cell is concatenated into a single "body"
 * field; the file name and the package's core properties are added as
 * separate fields. Instances are created through {@link loadXlsxFile()}.
 *
 * The class is only declared when the ZipArchive extension is loaded.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Xlsx extends Zend_Search_Lucene_Document_OpenXml
{
    /**
     * Xml Schema - SpreadsheetML
     *
     * Not referenced inside this class; kept as part of the public API.
     *
     * @var string
     */
    const SCHEMA_SPREADSHEETML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /**
     * Xml Schema - DrawingML
     *
     * Not referenced inside this class; kept as part of the public API.
     *
     * @var string
     */
    const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';

    /**
     * Xml Schema - Shared Strings
     *
     * Relationship type used to locate the workbook's shared string table.
     *
     * @var string
     */
    const SCHEMA_SHAREDSTRINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings';

    /**
     * Xml Schema - Worksheet relation
     *
     * Relationship type used to locate the workbook's worksheet parts.
     *
     * @var string
     */
    const SCHEMA_WORKSHEETRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet';

    /**
     * Xml Schema - Slide notes relation
     *
     * Not referenced inside this class; kept as part of the public API.
     *
     * @var string
     */
    const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

    /**
     * Object constructor
     *
     * Opens the package, collects all cell values and the core properties,
     * closes the package again and then registers the document fields:
     * "filename", "body", one field per core property and, when the core
     * properties carry no title, a "title" field holding the file name.
     *
     * @param string  $fileName     Path of the .xlsx file to index
     * @param boolean $storeContent When true the body text is stored in the
     *                              index (Text field); otherwise it is only
     *                              indexed (UnStored field)
     * @throws RuntimeException When the file cannot be opened as a zip package
     *                          or its relationship parts cannot be read
     */
    private function __construct($fileName, $storeContent)
    {
        // Open OpenXML package; ZipArchive::open() returns TRUE or an error code
        $package = new ZipArchive();
        if ($package->open($fileName) !== true) {
            throw new RuntimeException(
                'Unable to open "' . $fileName . '" as an OpenXML package.'
            );
        }

        // No "finally" is used so the code stays valid on the PHP versions
        // this file targets; the package is closed on both paths instead.
        try {
            $documentBody   = $this->_readCellValues($package);
            $coreProperties = $this->extractMetaData($package);
        } catch (Exception $e) {
            $package->close();
            throw $e;
        }
        $package->close();

        // Store filename
        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        $body = implode(' ', $documentBody);
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $body, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $body, 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Collect the values of all cells of all worksheets in the package.
     *
     * Worksheets are visited in the order of their relationship ids (with
     * the "rId" prefix removed), cells in document order.
     *
     * @param ZipArchive $package Opened OpenXML package
     * @return array Cell values (string, integer, float or boolean)
     * @throws RuntimeException When the package or workbook relationships
     *                          cannot be read
     */
    private function _readCellValues(ZipArchive $package)
    {
        $sharedStrings = array();
        $worksheets    = array();

        // Read relations and search for officeDocument
        $relations = $this->_loadXmlPart($package, '_rels/.rels');
        if ($relations === null) {
            throw new RuntimeException('Invalid archive or corrupted .xlsx file: "_rels/.rels" cannot be read.');
        }

        foreach ($relations->Relationship as $rel) {
            if ((string)$rel['Type'] != Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Read relations for workbook...
            $workbookTarget    = (string)$rel['Target'];
            $workbookDir       = dirname($workbookTarget);
            $workbookRelations = $this->_loadXmlPart(
                $package,
                $this->_resolvePartName($workbookDir, '_rels/' . basename($workbookTarget) . '.rels')
            );
            if ($workbookRelations === null) {
                throw new RuntimeException('Invalid archive or corrupted .xlsx file: workbook relationships cannot be read.');
            }
            $workbookRelations->registerXPathNamespace('rel', Zend_Search_Lucene_Document_OpenXml::SCHEMA_RELATIONSHIP);

            // Read shared strings. The relationship is optional: a workbook
            // without text cells has no shared string table.
            $sharedStringsRel = $workbookRelations->xpath(
                "rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']"
            );
            if (!empty($sharedStringsRel)) {
                $xmlStrings = $this->_loadXmlPart(
                    $package,
                    $this->_resolvePartName($workbookDir, (string)$sharedStringsRel[0]['Target'])
                );
                if ($xmlStrings !== null && isset($xmlStrings->si)) {
                    foreach ($xmlStrings->si as $item) {
                        // Every <si> gets a slot, even when it yields no
                        // text: cells refer to shared strings by position,
                        // so skipping an item would shift all later ones.
                        $sharedStrings[] = $this->_parseRichText($item);
                    }
                }
            }

            // Loop relations for workbook and extract worksheets...
            foreach ($workbookRelations->Relationship as $workbookRelation) {
                if ((string)$workbookRelation['Type'] != self::SCHEMA_WORKSHEETRELATION) {
                    continue;
                }

                $worksheet = $this->_loadXmlPart(
                    $package,
                    $this->_resolvePartName($workbookDir, (string)$workbookRelation['Target'])
                );
                if ($worksheet !== null) {
                    $sheetKey = str_replace('rId', '', (string)$workbookRelation['Id']);
                    $worksheets[$sheetKey] = $worksheet;
                }
            }

            // Only the first office document relationship is processed
            break;
        }

        // Sort worksheets by relationship id so the body text is deterministic
        ksort($worksheets);

        // Extract contents from worksheets
        $values = array();
        foreach ($worksheets as $worksheet) {
            if (!isset($worksheet->sheetData->row)) {
                continue;
            }
            foreach ($worksheet->sheetData->row as $row) {
                foreach ($row->c as $c) {
                    $values[] = $this->_readCellValue($c, $sharedStrings);
                }
            }
        }

        return $values;
    }

    /**
     * Convert a single worksheet cell (<c> element) into a PHP value.
     *
     * The cell's "t" attribute selects the interpretation of its <v> child:
     *  - "s":         index into the shared string table
     *  - "b":         boolean ('0' => false, '1' => true)
     *  - "inlineStr": text taken from the cell's <is> child
     *  - "e":         error text, returned as is
     *  - otherwise:   the raw text, converted to integer or float when numeric
     *
     * @param SimpleXMLElement $c             Cell element
     * @param array            $sharedStrings Shared string table, by position
     * @return string|integer|float|boolean
     */
    private function _readCellValue(SimpleXMLElement $c, array $sharedStrings)
    {
        $dataType = (string)$c['t'];
        $raw      = (string)$c->v;

        switch ($dataType) {
            case 's':
                // Value is a shared string
                if ($raw === '') {
                    return '';
                }
                $index = (int)$raw;
                // A reference outside the table yields an empty string
                return isset($sharedStrings[$index]) ? $sharedStrings[$index] : '';

            case 'b':
                // Value is boolean
                if ($raw == '0') {
                    return false;
                }
                if ($raw == '1') {
                    return true;
                }
                return (bool)$c->v;

            case 'inlineStr':
                // Value is rich text inline
                return $this->_parseRichText($c->is);

            case 'e':
                // Value is an error message
                return $raw;

            default:
                // Value is a string; check for numeric values
                if (is_numeric($raw)) {
                    return ($raw == (int)$raw) ? (int)$raw : (float)$raw;
                }
                return $raw;
        }
    }

    /**
     * Read a package part and parse it as XML.
     *
     * NOTE(review): the part is handed to simplexml_load_string() unchanged.
     * Spreadsheets usually come from untrusted sources, so consider hardening
     * the parsing against XML external entities.
     *
     * @param ZipArchive $package  Opened OpenXML package
     * @param string     $partName Name of the entry inside the archive
     * @return SimpleXMLElement|null Parsed part, or null when the entry is
     *                               missing, empty or not well-formed
     */
    private function _loadXmlPart(ZipArchive $package, $partName)
    {
        $contents = $package->getFromName($partName);
        if ($contents === false || $contents === '') {
            return null;
        }

        $xml = simplexml_load_string($contents);

        return ($xml instanceof SimpleXMLElement) ? $xml : null;
    }

    /**
     * Build the archive entry name for a relationship target.
     *
     * A target starting with "/" is taken relative to the package root, any
     * other target relative to $baseDir. The result is passed through the
     * inherited absoluteZipPath() helper (presumably resolving "." and ".."
     * segments - confirm against the parent class).
     *
     * @param string $baseDir Directory of the part owning the relationship
     * @param string $target  Relationship target
     * @return string
     */
    private function _resolvePartName($baseDir, $target)
    {
        $target = (string)$target;
        if ($target !== '' && $target[0] === '/') {
            $path = $target;
        } else {
            $path = $baseDir . '/' . $target;
        }

        // Archive entry names carry no leading slash
        return $this->absoluteZipPath(ltrim($path, '/'));
    }

    /**
     * Parse rich text XML
     *
     * Returns the element's <t> text when present; otherwise the <t> texts
     * of all its <r> children, concatenated without separator.
     *
     * @param SimpleXMLElement $is Shared string item or inline string element
     * @return string Plain text; empty when $is is not an element or holds
     *                neither <t> nor <r>
     */
    private function _parseRichText($is = null)
    {
        if (!($is instanceof SimpleXMLElement)) {
            return '';
        }

        if (isset($is->t)) {
            return (string)$is->t;
        }

        $text = '';
        if (isset($is->r)) {
            foreach ($is->r as $run) {
                $text .= (string)$run->t;
            }
        }

        return $text;
    }

    /**
     * Load Xlsx document from a file
     *
     * @param string  $fileName     Path of the .xlsx file to index
     * @param boolean $storeContent Whether the body text is stored in the
     *                              index in addition to being indexed
     * @return Zend_Search_Lucene_Document_Xlsx
     * @throws RuntimeException When the file cannot be read as an .xlsx package
     */
    public static function loadXlsxFile($fileName, $storeContent = false)
    {
        return new Zend_Search_Lucene_Document_Xlsx($fileName, $storeContent);
    }
}

} // end if (class_exists('ZipArchive'))