Code Coverage

| Item | Lines % | Lines | Functions and Methods % | Functions and Methods | CRAP | Classes and Traits % | Classes and Traits |
|---|---|---|---|---|---|---|---|
| Total | 93.25% | 152 / 163 | 91.67% | 11 / 12 | | 0.00% | 0 / 1 |
| Parser | 93.25% | 152 / 163 | 91.67% | 11 / 12 | 63.18 | 0.00% | 0 / 1 |
| getObjectStreams | 100.00% | 1 / 1 | 100.00% | 1 / 1 | 1 | | |
| getObjectMap | 100.00% | 1 / 1 | 100.00% | 1 / 1 | 1 | | |
| getFonts | 100.00% | 1 / 1 | 100.00% | 1 / 1 | 1 | | |
| parseFile | 100.00% | 2 / 2 | 100.00% | 1 / 1 | 1 | | |
| parseData | 100.00% | 2 / 2 | 100.00% | 1 / 1 | 1 | | |
| parse | 100.00% | 25 / 25 | 100.00% | 1 / 1 | 15 | | |
| initFile | 100.00% | 8 / 8 | 100.00% | 1 / 1 | 2 | | |
| initData | 100.00% | 5 / 5 | 100.00% | 1 / 1 | 1 | | |
| mapObjects | 100.00% | 62 / 62 | 100.00% | 1 / 1 | 9 | | |
| mapFonts | 69.44% | 25 / 36 | 0.00% | 0 / 1 | 23.30 | | |
| filterPages | 100.00% | 12 / 12 | 100.00% | 1 / 1 | 7 | | |
| getObjects | 100.00% | 8 / 8 | 100.00% | 1 / 1 | 7 | | |
1 | <?php |
2 | /** |
3 | * Pop PHP Framework (http://www.popphp.org/) |
4 | * |
5 | * @link https://github.com/popphp/popphp-framework |
6 | * @author Nick Sagona, III <dev@nolainteractive.com> |
7 | * @copyright Copyright (c) 2009-2024 NOLA Interactive, LLC. (http://www.nolainteractive.com) |
8 | * @license http://www.popphp.org/license New BSD License |
9 | */ |
10 | |
11 | /** |
12 | * @namespace |
13 | */ |
14 | namespace Pop\Pdf\Build; |
15 | |
16 | use Pop\Pdf\Document\AbstractDocument; |
17 | |
/**
 * Pdf parser class
 *
 * Splits raw PDF data into its "obj ... endobj" blocks, maps those blocks to
 * root/parent/info/page/stream entries, collects the fonts referenced by the
 * pages, and assembles a document object from the result.
 *
 * @category   Pop
 * @package    Pop\Pdf
 * @author     Nick Sagona, III <dev@nolainteractive.com>
 * @copyright  Copyright (c) 2009-2024 NOLA Interactive, LLC. (http://www.nolainteractive.com)
 * @license    http://www.popphp.org/license     New BSD License
 * @version    5.0.0
 */
class Parser extends AbstractParser
{

    /**
     * Raw text of each object block kept from the source data
     * @var array
     */
    protected array $objectStreams = [];

    /**
     * Object map, keyed by 'root', 'parent', 'info', 'pages' and 'streams'
     * @var array
     */
    protected array $objectMap = [];

    /**
     * Document fonts, each an array with 'name', 'index' and 'ref' keys
     * @var array
     */
    protected array $fonts = [];

    /**
     * Get the object streams
     *
     * @return array
     */
    public function getObjectStreams(): array
    {
        return $this->objectStreams;
    }

    /**
     * Get the object map
     *
     * @return array
     */
    public function getObjectMap(): array
    {
        return $this->objectMap;
    }

    /**
     * Get the document fonts
     *
     * @return array
     */
    public function getFonts(): array
    {
        return $this->fonts;
    }

    /**
     * Parse from file
     *
     * Loads the file contents, resets any previously parsed state and parses.
     *
     * @param  string $file
     * @param  mixed  $pages  A single 1-based page number, an array of them, or null for all pages
     * @throws Exception if the file does not exist or cannot be read
     * @return AbstractDocument
     */
    public function parseFile(string $file, mixed $pages = null): AbstractDocument
    {
        $this->initFile($file);
        return $this->parse($pages);
    }

    /**
     * Parse from raw data stream
     *
     * Stores the data, resets any previously parsed state and parses.
     *
     * @param  string $data
     * @param  mixed  $pages  A single 1-based page number, an array of them, or null for all pages
     * @throws Exception
     * @return AbstractDocument
     */
    public function parseData(string $data, mixed $pages = null): AbstractDocument
    {
        $this->initData($data);
        return $this->parse($pages);
    }

    /**
     * Parse the data stream
     *
     * Works on the data currently held by the parser. Object blocks are appended
     * to the existing object streams, so callers that invoke this method directly
     * more than once should re-initialize the parser first.
     *
     * @param  mixed $pages  A single 1-based page number, an array of them, or null for all pages
     * @return AbstractDocument
     */
    public function parse(mixed $pages = null): AbstractDocument
    {
        $matches = [];
        preg_match_all('/\d*\s\d*\sobj(.*?)endobj/sm', $this->data, $matches);

        // Keep every object block except linearization and metadata objects
        foreach (($matches[0] ?? []) as $objectText) {
            if (str_contains($objectText, '/Linearized') || str_contains($objectText, '/Type/Metadata')) {
                continue;
            }
            $this->objectStreams[] = $objectText;
        }

        // Map the objects by parsing the object streams
        $this->mapObjects();

        $hasPages = isset($this->objectMap['pages']);

        if ($hasPages) {
            // Map fonts, if any
            if (isset($this->objectMap['streams'])) {
                $this->mapFonts();
            }
            // If certain pages are to be imported, filter out the unwanted pages
            if ($pages !== null) {
                $this->filterPages($pages);
            }
        }

        $doc = new \Pop\Pdf\Document();

        if (isset($this->objectMap['root']['object'])) {
            $doc->setVersion($this->objectMap['root']['object']->getVersion());
        }
        if (isset($this->objectMap['info']['object'])) {
            $doc->setMetadata($this->objectMap['info']['object']->getMetadata());
        }

        $doc->importObjects($this->getObjects());
        $doc->importFonts($this->getFonts());

        if ($hasPages) {
            foreach ($this->objectMap['pages'] as $i => $page) {
                $pg = new \Pop\Pdf\Document\Page($page['width'], $page['height'], $i);
                $pg->importPageObject($page['object']);
                $doc->addPage($pg);
            }
        }

        return $doc;
    }

    /**
     * Initialize the file and get the data
     *
     * @param  string $file
     * @throws Exception if the file does not exist or its contents cannot be read
     * @return Parser
     */
    protected function initFile(string $file): Parser
    {
        if (!file_exists($file)) {
            throw new Exception('Error: That PDF file does not exist.');
        }

        // file_get_contents() returns false on failure; never store that as data
        $contents = file_get_contents($file);
        if ($contents === false) {
            throw new Exception('Error: Unable to read that PDF file.');
        }

        $this->file = $file;
        $this->data = $contents;

        $this->resetParsedState();

        return $this;
    }

    /**
     * Initialize data
     *
     * @param  string $data
     * @return Parser
     */
    protected function initData(string $data): Parser
    {
        $this->data = $data;

        $this->resetParsedState();

        return $this;
    }

    /**
     * Map the objects
     *
     * Parses each kept object block into the object class matching its stream
     * type, marks it as imported and records it in the object map. Root, parent
     * and info entries are stored once; pages and streams are keyed by the
     * index of the parsed object.
     *
     * @return void
     */
    protected function mapObjects(): void
    {
        foreach ($this->objectStreams as $stream) {
            switch ($this->getStreamType($stream)) {
                case 'root':
                    $root = PdfObject\RootObject::parse($stream);
                    $root->setImported(true);
                    // Three characters at offset 5 of the raw data
                    // NOTE(review): presumably the "x.y" of a leading "%PDF-x.y" header - confirm
                    $root->setVersion(substr($this->data, 5, 3));
                    $this->objectMap['root'] = [
                        'stream' => $stream,
                        'object' => $root,
                        'index'  => $root->getIndex(),
                        'parent' => $root->getParentIndex()
                    ];
                    break;
                case 'parent':
                    $parent = PdfObject\ParentObject::parse($stream);
                    $parent->setImported(true);
                    $this->objectMap['parent'] = [
                        'stream' => $stream,
                        'object' => $parent,
                        'index'  => $parent->getIndex(),
                        'count'  => $parent->getCount(),
                        'kids'   => $parent->getKids()
                    ];
                    break;
                case 'info':
                    $info = PdfObject\InfoObject::parse($stream);
                    $info->setImported(true);
                    $this->objectMap['info'] = [
                        'stream' => $stream,
                        'object' => $info,
                        'index'  => $info->getIndex(),
                    ];
                    break;
                case 'page':
                    if (!isset($this->objectMap['pages'])) {
                        $this->objectMap['pages'] = [];
                    }

                    $page = PdfObject\PageObject::parse($stream);
                    $page->setImported(true);

                    $this->objectMap['pages'][$page->getIndex()] = [
                        'stream'   => $stream,
                        'object'   => $page,
                        'index'    => $page->getIndex(),
                        'parent'   => $page->getParentIndex(),
                        'width'    => $page->getWidth(),
                        'height'   => $page->getHeight(),
                        'content'  => $page->getContent(),
                        'annots'   => $page->getAnnots(),
                        'fonts'    => $page->getFonts(),
                        'xObjects' => $page->getXObjects()
                    ];
                    break;
                case 'stream':
                    if (!isset($this->objectMap['streams'])) {
                        $this->objectMap['streams'] = [];
                    }

                    // Use a dedicated local rather than overwriting the loop variable
                    $streamObject = PdfObject\StreamObject::parse($stream);
                    $streamObject->setImported(true);

                    // Both 'stream' and 'object' hold the parsed object for this entry type
                    $this->objectMap['streams'][$streamObject->getIndex()] = [
                        'stream' => $streamObject,
                        'object' => $streamObject,
                        'index'  => $streamObject->getIndex()
                    ];
                    break;
            }
        }
    }

    /**
     * Map the fonts, if any
     *
     * For every font referenced by a mapped page, looks up the stream entry with
     * the same index, extracts the name following "/BaseFont" and records a
     * unique ['name', 'index', 'ref'] entry. Fonts with no mapped stream entry
     * or no "/BaseFont" key are skipped.
     *
     * @return void
     */
    protected function mapFonts(): void
    {
        foreach (($this->objectMap['pages'] ?? []) as $page) {
            foreach (($page['fonts'] ?? []) as $i => $font) {
                if (!isset($this->objectMap['streams'][$i]['stream'])) {
                    continue;
                }

                // The entry holds a parsed object; the string functions below need its text form
                $definition = (string)$this->objectMap['streams'][$i]['stream'];
                $position   = strpos($definition, '/BaseFont');

                if ($position === false) {
                    continue;
                }

                // 9 is the length of "/BaseFont"
                $fontName = trim(substr($definition, $position + 9));

                if (str_starts_with($fontName, '/')) {
                    $fontName = substr($fontName, 1);
                }

                // Cut at whichever of "/" or ">" comes first; keep everything if neither occurs
                $fontName = substr($fontName, 0, strcspn($fontName, '/>'));

                $f = [
                    'name'  => trim($fontName),
                    'index' => $i,
                    'ref'   => $font
                ];

                if (!in_array($f, $this->fonts, true)) {
                    $this->fonts[] = $f;
                }
            }
        }

        // Collect the text between the first space after "/FontFile" and the following "0 R"
        $fontFileObjects = [];
        foreach ($this->objectStreams as $stream) {
            if (str_contains($stream, '/FontFile')) {
                $fontFileObject = substr($stream, strpos($stream, '/FontFile'));
                $fontFileObject = substr($fontFileObject, (strpos($fontFileObject, ' ') + 1));
                $fontFileObject = trim(substr($fontFileObject, 0, strpos($fontFileObject, '0 R')));
                $fontFileObjects[] = $fontFileObject;
            }
        }

        // NOTE(review): this branch only acts on object number 13 and discards the
        // resulting font parser; it looks like unfinished or debugging code. It is
        // kept unchanged because the side effects of the TrueType constructor are
        // not visible from this class - confirm before removing or generalizing.
        foreach ($fontFileObjects as $fontFileObject) {
            if (($fontFileObject == 13) && isset($this->objectMap['streams'][$fontFileObject])) {
                $fontFile = $this->objectMap['streams'][$fontFileObject];
                $contents = ($fontFile['object']->getEncoding() == 'FlateDecode') ?
                    gzuncompress(trim($fontFile['object']->getStream())) : $fontFile['object']->getStream();

                $fontParser = new \Pop\Pdf\Build\Font\TrueType(null, $contents);
            }
        }
    }

    /**
     * Filter pages
     *
     * Keeps only the requested pages. Page numbers are 1-based positions in the
     * parent object's kids list; numbers with no matching kid are ignored. The
     * parent object and its map entry are updated, and unwanted pages are
     * removed from the page map. Does nothing when no parent object is mapped.
     *
     * @param  mixed $pages  A single page number or an array of page numbers
     * @return void
     */
    protected function filterPages(mixed $pages): void
    {
        // Without a parent object there is no kids list to filter against
        if (!isset($this->objectMap['parent']['object'])) {
            return;
        }

        $parent = $this->objectMap['parent']['object'];
        $pages  = is_array($pages) ? $pages : [$pages];
        $kids   = $parent->getKids();
        $keep   = [];

        foreach ($pages as $page) {
            if (isset($kids[$page - 1])) {
                $keep[] = $kids[$page - 1];
            }
        }

        $parent->setKids($keep);
        $this->objectMap['parent']['count'] = count($keep);
        $this->objectMap['parent']['kids']  = $keep;

        foreach ($kids as $kid) {
            if (!in_array($kid, $keep, true) && isset($this->objectMap['pages'][$kid])) {
                unset($this->objectMap['pages'][$kid]);
            }
        }
    }

    /**
     * Get the objects for import
     *
     * Returns the root, parent, info and stream objects keyed by their index.
     * Page entries are not included here.
     *
     * @return array
     */
    protected function getObjects(): array
    {
        $objects = [];

        foreach ($this->objectMap as $type => $entry) {
            if (in_array($type, ['root', 'parent', 'info'], true)) {
                $objects[$entry['index']] = $entry['object'];
            } elseif ($type === 'streams') {
                foreach ($entry as $streamEntry) {
                    $objects[$streamEntry['index']] = $streamEntry['stream'];
                }
            }
        }

        return $objects;
    }

    /**
     * Clear everything derived from a previous parse
     *
     * @return void
     */
    private function resetParsedState(): void
    {
        $this->objectStreams = [];
        $this->objectMap     = [];
        $this->fonts         = [];
    }

}