Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
93.25% covered (success)
93.25%
152 / 163
91.67% covered (success)
91.67%
11 / 12
CRAP
0.00% covered (danger)
0.00%
0 / 1
Parser
93.25% covered (success)
93.25%
152 / 163
91.67% covered (success)
91.67%
11 / 12
63.18
0.00% covered (danger)
0.00%
0 / 1
 getObjectStreams
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getObjectMap
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getFonts
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 parseFile
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 parseData
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 parse
100.00% covered (success)
100.00%
25 / 25
100.00% covered (success)
100.00%
1 / 1
15
 initFile
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
 initData
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 mapObjects
100.00% covered (success)
100.00%
62 / 62
100.00% covered (success)
100.00%
1 / 1
9
 mapFonts
69.44% covered (warning)
69.44%
25 / 36
0.00% covered (danger)
0.00%
0 / 1
23.30
 filterPages
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
7
 getObjects
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
7
1<?php
2/**
3 * Pop PHP Framework (http://www.popphp.org/)
4 *
5 * @link       https://github.com/popphp/popphp-framework
6 * @author     Nick Sagona, III <dev@nolainteractive.com>
7 * @copyright  Copyright (c) 2009-2024 NOLA Interactive, LLC. (http://www.nolainteractive.com)
8 * @license    http://www.popphp.org/license     New BSD License
9 */
10
11/**
12 * @namespace
13 */
14namespace Pop\Pdf\Build;
15
16use Pop\Pdf\Document\AbstractDocument;
17
18/**
19 * Pdf parser class
20 *
21 * @category   Pop
22 * @package    Pop\Pdf
23 * @author     Nick Sagona, III <dev@nolainteractive.com>
24 * @copyright  Copyright (c) 2009-2024 NOLA Interactive, LLC. (http://www.nolainteractive.com)
25 * @license    http://www.popphp.org/license     New BSD License
26 * @version    5.0.0
27 */
28class Parser extends AbstractParser
29{
30
31    /**
32     * Parsed object data streams
33     * @var array
34     */
35    protected array $objectStreams = [];
36
37    /**
38     * Object map
39     * @var array
40     */
41    protected array $objectMap = [];
42
43    /**
44     * Document fonts
45     * @var array
46     */
47    protected array $fonts = [];
48
49    /**
50     * Get the object streams
51     *
52     * @return array
53     */
54    public function getObjectStreams(): array
55    {
56        return $this->objectStreams;
57    }
58
59    /**
60     * Get the object map
61     *
62     * @return array
63     */
64    public function getObjectMap(): array
65    {
66        return $this->objectMap;
67    }
68
69    /**
70     * Get the document fonts
71     *
72     * @return array
73     */
74    public function getFonts(): array
75    {
76        return $this->fonts;
77    }
78
79    /**
80     * Parse from file
81     *
82     * @param  string $file
83     * @param  mixed  $pages
84     * @throws Exception
85     * @return AbstractDocument
86     */
87    public function parseFile(string $file, mixed $pages = null): AbstractDocument
88    {
89        $this->initFile($file);
90        return $this->parse($pages);
91    }
92
93    /**
94     * Parse from raw data stream
95     *
96     * @param  string $data
97     * @param  mixed  $pages
98     * @throws Exception
99     * @return AbstractDocument
100     */
101    public function parseData(string $data, mixed $pages = null): AbstractDocument
102    {
103        $this->initData($data);
104        return $this->parse($pages);
105    }
106
107    /**
108     * Parse the data stream
109     *
110     * @param  mixed  $pages
111     * @return AbstractDocument
112     */
113    public function parse(mixed $pages = null): AbstractDocument
114    {
115        $matches = [];
116        preg_match_all('/\d*\s\d*\sobj(.*?)endobj/sm', $this->data, $matches, PREG_OFFSET_CAPTURE);
117
118        if (isset($matches[0]) && isset($matches[0][0])) {
119            foreach ($matches[0] as $match) {
120                if ((!str_contains($match[0], '/Linearized')) && (!str_contains($match[0], '/Type/Metadata'))) {
121                    $this->objectStreams[] = $match[0];
122                }
123            }
124        }
125
126        // Map the objects by parsing the object streams
127        $this->mapObjects();
128
129        if (isset($this->objectMap['pages'])) {
130            // Map fonts, if any
131            if (isset($this->objectMap['streams'])) {
132                $this->mapFonts();
133            }
134            // If certain pages are to be imported, filter out the unwanted pages
135            if ($pages !== null) {
136                $this->filterPages($pages);
137            }
138        }
139
140        $doc = new \Pop\Pdf\Document();
141
142        if (isset($this->objectMap['root']) && isset($this->objectMap['root']['object'])) {
143            $doc->setVersion($this->objectMap['root']['object']->getVersion());
144        }
145        if (isset($this->objectMap['info']) && isset($this->objectMap['info']['object'])) {
146            $doc->setMetadata($this->objectMap['info']['object']->getMetadata());
147        }
148
149        $doc->importObjects($this->getObjects());
150        $doc->importFonts($this->getFonts());
151
152        if (isset($this->objectMap['pages'])) {
153            foreach ($this->objectMap['pages'] as $i => $page) {
154                $pg = new \Pop\Pdf\Document\Page($page['width'], $page['height'], $i);
155                $pg->importPageObject($page['object']);
156                $doc->addPage($pg);
157            }
158        }
159
160        return $doc;
161    }
162
163    /**
164     * Initialize the file and get the data
165     *
166     * @param  string $file
167     * @throws Exception
168     * @return Parser
169     */
170    protected function initFile(string $file): Parser
171    {
172        if (!file_exists($file)) {
173            throw new Exception('Error: That PDF file does not exist.');
174        }
175
176        $this->file = $file;
177        $this->data = file_get_contents($this->file);
178
179        $this->objectStreams = [];
180        $this->objectMap     = [];
181        $this->fonts         = [];
182
183        return $this;
184    }
185
186    /**
187     * Initialize data
188     *
189     * @param  string $data
190     * @return Parser
191     */
192    protected function initData(string $data): Parser
193    {
194        $this->data = $data;
195
196        $this->objectStreams = [];
197        $this->objectMap     = [];
198        $this->fonts         = [];
199
200        return $this;
201    }
202
203    /**
204     * Map the objects
205     *
206     * @return void
207     */
208    protected function mapObjects(): void
209    {
210        foreach ($this->objectStreams as $stream) {
211            switch ($this->getStreamType($stream)) {
212                case 'root':
213                    $root = PdfObject\RootObject::parse($stream);
214                    $root->setImported(true);
215                    $root->setVersion(substr($this->data, 5, 3));
216                    $this->objectMap['root'] = [
217                        'stream' => $stream,
218                        'object' => $root,
219                        'index'  => $root->getIndex(),
220                        'parent' => $root->getParentIndex()
221                    ];
222                    break;
223                case 'parent':
224                    $parent = PdfObject\ParentObject::parse($stream);
225                    $parent->setImported(true);
226                    $this->objectMap['parent'] = [
227                        'stream' => $stream,
228                        'object' => $parent,
229                        'index'  => $parent->getIndex(),
230                        'count'  => $parent->getCount(),
231                        'kids'   => $parent->getKids()
232                    ];
233                    break;
234                case 'info':
235                    $info = PdfObject\InfoObject::parse($stream);
236                    $info->setImported(true);
237                    $this->objectMap['info'] = [
238                        'stream' => $stream,
239                        'object' => $info,
240                        'index'  => $info->getIndex(),
241                    ];
242                    break;
243                case 'page':
244                    if (!isset($this->objectMap['pages'])) {
245                        $this->objectMap['pages'] = [];
246                    }
247
248                    $page = PdfObject\PageObject::parse($stream);
249                    $page->setImported(true);
250
251                    $this->objectMap['pages'][$page->getIndex()] = [
252                        'stream'   => $stream,
253                        'object'   => $page,
254                        'index'    => $page->getIndex(),
255                        'parent'   => $page->getParentIndex(),
256                        'width'    => $page->getWidth(),
257                        'height'   => $page->getHeight(),
258                        'content'  => $page->getContent(),
259                        'annots'   => $page->getAnnots(),
260                        'fonts'    => $page->getFonts(),
261                        'xObjects' => $page->getXObjects()
262                    ];
263                    break;
264                case 'stream':
265                    if (!isset($this->objectMap['streams'])) {
266                        $this->objectMap['streams'] = [];
267                    }
268                    $stream = PdfObject\StreamObject::parse($stream);
269                    $stream->setImported(true);
270                    $this->objectMap['streams'][$stream->getIndex()] = [
271                        'stream' => $stream,
272                        'object' => $stream,
273                        'index'  => $stream->getIndex()
274                    ];
275                    break;
276            }
277        }
278    }
279
280    /**
281     * Map the fonts, if any
282     *
283     * @return void
284     */
285    protected function mapFonts(): void
286    {
287        foreach ($this->objectMap['pages'] as $page) {
288            if (isset($page['fonts']) && (count($page['fonts']) > 0)) {
289                foreach ($page['fonts'] as $i => $font) {
290                    if (str_contains($this->objectMap['streams'][$i]['stream'], '/BaseFont')) {
291                        $fontName = trim(
292                            substr(
293                                $this->objectMap['streams'][$i]['stream'],
294                                (strpos($this->objectMap['streams'][$i]['stream'], '/BaseFont') + 9)
295                            )
296                        );
297
298                        if (str_starts_with($fontName, '/')) {
299                            $fontName = substr($fontName, 1);
300                        }
301                        $fontName = ((str_contains($fontName, '/'))) ?
302                            substr($fontName, 0, strpos($fontName, '/')) :
303                            substr($fontName, 0, strpos($fontName, '>'));
304
305                        $f = [
306                            'name'  => trim($fontName),
307                            'index' => $i,
308                            'ref'   => $font
309                        ];
310
311                        if (!in_array($f, $this->fonts, true)) {
312                            $this->fonts[] = $f;
313                        }
314                    }
315                }
316            }
317        }
318
319        $fontFileObjects = [];
320        foreach ($this->objectStreams as $stream) {
321            if (str_contains($stream, '/FontFile')) {
322                $fontFileObject = substr($stream, strpos($stream, '/FontFile'));
323                $fontFileObject = substr($fontFileObject, (strpos($fontFileObject, ' ') + 1));
324                $fontFileObject = trim(substr($fontFileObject, 0, strpos($fontFileObject, '0 R')));
325                $fontFileObjects[] = $fontFileObject;
326            }
327        }
328
329        if (!empty($fontFileObjects)) {
330            foreach ($fontFileObjects as $fontFileObject) {
331                if (($fontFileObject == 13) && isset($this->objectMap['streams'][$fontFileObject])) {
332                    $fontFile = $this->objectMap['streams'][$fontFileObject];
333                    $contents = ($fontFile['object']->getEncoding() == 'FlateDecode') ?
334                        gzuncompress(trim($fontFile['object']->getStream())) : $fontFile['object']->getStream();
335
336                    $fontParser = new \Pop\Pdf\Build\Font\TrueType(null, $contents);
337                }
338            }
339        }
340    }
341
342    /**
343     * Filter pages
344     *
345     * @param  mixed $pages
346     * @return void
347     */
348    protected function filterPages(mixed $pages): void
349    {
350        $pages = (!is_array($pages)) ? [$pages] : $pages;
351        $kids = $this->objectMap['parent']['object']->getKids();
352        $keep = [];
353        foreach ($pages as $page) {
354            if (isset($kids[$page - 1])) {
355                $keep[] = $kids[$page - 1];
356            }
357        }
358
359        $this->objectMap['parent']['object']->setKids($keep);
360        $this->objectMap['parent']['count']  = count($keep);
361        $this->objectMap['parent']['kids']   = $keep;
362
363        foreach ($kids as $kid) {
364            if (!in_array($kid, $keep) && isset($this->objectMap['pages'][$kid])) {
365                unset($this->objectMap['pages'][$kid]);
366            }
367        }
368    }
369
370    /**
371     * Get the objects for import
372     *
373     * @return array
374     */
375    protected function getObjects(): array
376    {
377        $objects = [];
378        foreach ($this->objectMap as $type => $object) {
379            if (($type == 'root') || ($type == 'parent') || ($type == 'info')) {
380                $objects[$object['index']] = $object['object'];
381            } else if ($type == 'streams') {
382                foreach ($object as $obj) {
383                    $objects[$obj['index']] = $obj['stream'];
384                }
385            }
386        }
387
388        return $objects;
389    }
390
391}