corinthia-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From g..@apache.org
Subject [3/3] incubator-corinthia git commit: Produce a list of html nodes.
Date Sun, 10 May 2015 01:13:26 GMT
Produce a list of html nodes.

This is the current blueprint I hope to use; I think it is ready to be
looked at to check whether this approach is useful.

I can and probably will add function pointers to struct
ODF_to_HTML_key where needed.

What works: This produces a list of HTML nodes.  H1..H6 tags are
correctly reported and corresponding nodes are generated.  Missing
TEXT_H entries also are correctly reported.  Text nodes are created.

To test, from the build directory:

~/odf-branch/incubator-corinthia/build>
$ ./bin/dfconvert get ../gbg_samples/headers.odt foo.html;

* gbg_test_output.txt: Output of test above for convenience.

* DocFormats/filters/odf/src/text/ODFText.c

  (ODFTextGet): add full report for newly generated htmlNode list.

  (traverseContent): plug in the traversal algorithm.

* DocFormats/filters/odf/src/text/gbg_test.c

  (struct ODF_to_HTML_key): Move declaration to gbg_test.h.

  (report_tags_found): Minor changes.

  (listODF_keys): New analysis function to only print out the current
    mappings.

  (locate_HTML): Remove magic number.

  (show_nodes): New function.  Traverse a list of nodes and print every
    node out.

  (printNode): Function now prints a full DFNode profile (where
    applicable).

  (printMissingTag): New function.  Create string to alert to broken
    nodes.

* DocFormats/filters/odf/src/text/gbg_test.h
  Add prototypes for the new functions mentioned in gbg_test.c.

  (translateXMLEnumName): Add 10 dummy entries so the magic number
    '10' can be removed from code.

* gbg_samples: New directory with debugging odt files.

* gbg_samples/headers.odt:  odt file containing all ten possible header tags.


Project: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/commit/c81e6862
Tree: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/tree/c81e6862
Diff: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/diff/c81e6862

Branch: refs/heads/odf-filter-attempt2
Commit: c81e68626489b9515e7e8f3a5ce5d38ac8f59af0
Parents: 88d7f15
Author: Gabriela Gibson <gbg@apache.org>
Authored: Sun May 10 02:14:03 2015 +0100
Committer: Gabriela Gibson <gbg@apache.org>
Committed: Sun May 10 02:14:03 2015 +0100

----------------------------------------------------------------------
 DocFormats/filters/odf/src/text/ODFText.c  |   48 +-
 DocFormats/filters/odf/src/text/gbg_test.c |   99 +-
 DocFormats/filters/odf/src/text/gbg_test.h | 6719 ++++++++++++-----------
 gbg_samples/headers.odt                    |  Bin 0 -> 9780 bytes
 gbg_test_output.txt                        |  427 ++
 5 files changed, 3895 insertions(+), 3398 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-corinthia/blob/c81e6862/DocFormats/filters/odf/src/text/ODFText.c
----------------------------------------------------------------------
diff --git a/DocFormats/filters/odf/src/text/ODFText.c b/DocFormats/filters/odf/src/text/ODFText.c
index de1baed..3025f0b 100644
--- a/DocFormats/filters/odf/src/text/ODFText.c
+++ b/DocFormats/filters/odf/src/text/ODFText.c
@@ -34,24 +34,33 @@ typedef struct {
     DFHashTable *htmlIdByNumId;
 } ODFPutData;
 
-
 static void traverseContent(ODFTextConverter *conv, DFNode *odfNode, DFNode *htmlNode)
 {
-    for (DFNode *odfChild = odfNode->first; odfChild != NULL; odfChild = odfChild->next)
-        {
-            // printNode(odfChild);
-            if (odfChild->tag == 2) {
-                // we have some text here.
+    DFNode *child;
+
+    for (DFNode *odfChild = odfNode->first; odfChild != NULL; odfChild = odfChild->next)
{
+        if (odfChild->tag == 2) { // we have some text here.
+            child = DFCreateChildElement(htmlNode, odfChild->tag);
+            child->value = xstrdup(odfChild->value);
+        }
+        else {
+            Tag newTag = locate_HTML(odfChild);
+            if (newTag) {  // we find an already mapped ODF -> HTML tag
+                child = DFCreateChildElement(htmlNode, newTag);
             }
-            else {
-                Tag newTag = locate_HTML(odfChild);
+            else {  // We found a missing tag
+                child = DFCreateChildElement(htmlNode, 2);
+                child->value = printMissingTag(odfChild->tag);
+                if (odfChild->attrs)
+                    DFSetAttribute(child, odfChild->attrs->tag, odfChild->attrs->value);
             }
-            traverseContent(conv,odfChild,htmlNode);
         }
-        // TODO: Add a switch statement here to check the type of ODF element, and use
-        // DFCreateChildElement to create a new element in the HTML document as a child of
htmlNode
-        // based on the type. As this function gets more complicated, it will likely be useful
to
-        // split it up into several functions
+        traverseContent(conv,odfChild,htmlNode);
+    }
+    // TODO: Add a switch statement here to check the type of ODF element, and use
+    // DFCreateChildElement to create a new element in the HTML document as a child of htmlNode
+    // based on the type. As this function gets more complicated, it will likely be useful
to
+    // split it up into several functions
 }
 
 DFDocument *ODFTextGet(DFStorage *concreteStorage, DFStorage *abstractStorage, const char
*idPrefix, DFError **error)
@@ -74,12 +83,19 @@ DFDocument *ODFTextGet(DFStorage *concreteStorage, DFStorage *abstractStorage,
c
     // contentDoc is loaded from content.xml, and represents the most important information
in
     // the document, i.e. the text, tables, lists, etc.
     tagSeen = " ";
-    // Tag newTag = locate_HTML(package->contentDoc->root);
+
     traverseContent(conv, package->contentDoc->root, body);
-    
-    if (REPORT_TAG_FOUND)
+
+    if (REPORT_TAG_FOUND) 
         free(tagSeen);
 
+    printf("============================================================\n"
+           "Showing the result of the traverseContent function\n"
+           "============================================================\n"
+           );
+    show_nodes(body);
+    
+
     // TODO: Once this basic traversal is implemented and is capable of producing paragraphs,
     // tables, lists, and spans, add ids to the HTML elements as they are created. That is,
set
     // the id attribute of each new HTML element to a string containing the idPrefix followed
by

http://git-wip-us.apache.org/repos/asf/incubator-corinthia/blob/c81e6862/DocFormats/filters/odf/src/text/gbg_test.c
----------------------------------------------------------------------
diff --git a/DocFormats/filters/odf/src/text/gbg_test.c b/DocFormats/filters/odf/src/text/gbg_test.c
index f56c14f..ea7b92f 100644
--- a/DocFormats/filters/odf/src/text/gbg_test.c
+++ b/DocFormats/filters/odf/src/text/gbg_test.c
@@ -9,18 +9,6 @@
 #include "DFXMLNames.h"
 #include "gbg_test.h"
 
-void printNode(DFNode *node);  // temp func, just for some convenience
-/// Helper functions
-
-
-// it may be that the Tag Attribute is not needed.
-typedef struct {
-    Tag ODF_KEY;
-    Tag HTML_KEY;
-    Tag attribute;
-    char *attribute_value;
-} ODF_to_HTML_key;
-
 #define ENDMARKER 555555
 ODF_to_HTML_key ODF_to_HTML_keys [] = {
     { 1, HTML_A, 0, NULL},
@@ -111,7 +99,7 @@ ODF_to_HTML_key ODF_to_HTML_keys [] = {
     { TEXT_H, HTML_H6, 2310, "Heading_20_7" },
     { TEXT_H, HTML_H6, 2310, "Heading_20_8" },
     { TEXT_H, HTML_H6, 2310, "Heading_20_9" },
-    { TEXT_H, HTML_H6, 2310, "Heading_20_10" },
+    //    { TEXT_H, HTML_H6, 2310, "Heading_20_10" },
     { 1, HTML_HEAD, 0, NULL},
     { 1, HTML_HEADER, 0, NULL},
     { 1, HTML_HEADERS, 0, NULL},
@@ -329,7 +317,9 @@ ODF_to_HTML_key ODF_to_HTML_keys [] = {
     { 0,ENDMARKER, 0, NULL},
 };
 
-
+// strictly speaking because printing the generated node list out
+// gives the same information, this is no longer neccessary, but it is
+// an easier to read report.
 void report_tags_found(const char *name, Tag HTML, Tag missing_tag)
 {
     if (!REPORT_TAG_FOUND) return;
@@ -345,9 +335,9 @@ void report_tags_found(const char *name, Tag HTML, Tag missing_tag)
         snprintf(newTagSeen, len,"%s%s",tagSeen,name);
         tagSeen = xstrdup(newTagSeen);
         free(newTagSeen);
-        
+
         if (missing_tag == 1) {
-            printf("Missing: { %s,\"Add HTML key here\" },\n",name);
+            printf("ODF Key not matched: %s ---  %zu\n",name, HTML);
         }
         else if (missing_tag == 2) {
             printf("Error: No entry found in DFXMLNames: DFNodeName = %s  Tag: %d\n", name,
HTML);
@@ -359,34 +349,45 @@ void report_tags_found(const char *name, Tag HTML, Tag missing_tag)
     }    
 }
 
+void listODF_keys(int how)
+{
+    for (int i = 0; ODF_to_HTML_keys[i].HTML_KEY != ENDMARKER; i++) {
+        if (ODF_to_HTML_keys[i].ODF_KEY > 3 && ODF_to_HTML_keys[i].HTML_KEY >
3) {
+            printf("%-16s <--->     %s\n", 
+                   translateXMLEnumName[ODF_to_HTML_keys[i].ODF_KEY],
+                   translateXMLEnumName[ODF_to_HTML_keys[i].HTML_KEY]);
+        } else if (ODF_to_HTML_keys[i].ODF_KEY > 3 && ODF_to_HTML_keys[i].HTML_KEY
< 3)  {
+            printf("No suitable match found: %-16s\n", 
+                   translateXMLEnumName[ODF_to_HTML_keys[i].ODF_KEY]);
+        }
+    }
+}
+
+
 Tag locate_HTML(DFNode *odfNode)
 {
-    // subtract the offset of 10 in the enum defined in DFXMLNames.h
-    int index = (int)odfNode->tag - 10;
+    int index = (int)odfNode->tag;
     int attrib_not_found = 0;
 
     if (index > -1) {
         for (int i = 0; ODF_to_HTML_keys[i].HTML_KEY != ENDMARKER; i++) {
-            //printf("Seen: %s\n", translateXMLEnumName[ODF_to_HTML_keys[i].ODF_KEY - 10]);
-            if (ODF_to_HTML_keys[i].ODF_KEY - 10 == index) {
+            if (ODF_to_HTML_keys[i].ODF_KEY == index) {
                 if (ODF_to_HTML_keys[i].attribute_value) {
                     if (strcmp(odfNode->attrs->value, ODF_to_HTML_keys[i].attribute_value))
{
                         attrib_not_found = 1;
                         continue;
                     } else {
-                        report_tags_found(translateXMLEnumName[index], ODF_to_HTML_keys[i].HTML_KEY
- 10, 0);
-                        attrib_not_found = 0;
+                        report_tags_found(translateXMLEnumName[index], ODF_to_HTML_keys[i].HTML_KEY,
0);
                         return ODF_to_HTML_keys[i].HTML_KEY;
                     }
                 }
-                if (attrib_not_found == 1) { // we have attribs, but one is missing
-                    report_tags_found(odfNode->attrs->value, ODF_to_HTML_keys[i-1].ODF_KEY
- 10, 3);
+                if (attrib_not_found) { // we have attribs, but one is missing
+                    report_tags_found(odfNode->attrs->value, ODF_to_HTML_keys[i-1].ODF_KEY,
3);
                     return 0;
                 }
             }
-        }
-        // Valid Tag not found in array
-        report_tags_found(translateXMLEnumName[index], 0, 1);
+        }        
+        report_tags_found(translateXMLEnumName[index], index, 1);
         return 0;
     }
     else {  
@@ -397,18 +398,46 @@ Tag locate_HTML(DFNode *odfNode)
 }
 
 
+void show_nodes(DFNode *odfNode)
+{
+    for (DFNode *odfChild = odfNode->first; odfChild != NULL; odfChild = odfChild->next)
{
+        printNode(odfChild);
+    }
+}
+
 void printNode(DFNode *n)
 {
     if (n == NULL) return;
-    //    printf("Tag = %d Attrcount = %d\t", n->tag, n->attrcount);
-    //    printf("seqNo = %zu \t", n->seqNo);
-    printf("value = %s \t\t", n->value);
+
+    printf("Tag tag: %zu\n",n->tag);
+    printf("unsigned int seqNo: %d\n",n->seqNo);
+    // printf("struct DFDocument *doc: %p\n",n->doc);
+    if (n->js)      printf("void *js: %p\n",n->js);
+    if (n->changed) printf("int changed: %d\n",n->changed);
+    if (n->childrenChanged) printf("int childrenChanged %d\n",n->childrenChanged);
+    if (n->seqNoHashNext) printf("DFNode *seqNoHashNext %p\n", n->seqNoHashNext);
     if (n->attrs) {
-        printf("HTML TAG = %d %s  \t", n->attrs->tag,
-               translateXMLEnumName[locate_HTML(n)-10]);
-        //        printf("attr value = %s \t", n->attrs->value);
+        printf("DFAttribute *attrs: %p ",n->attrs);
+        printf(", Tag tags: %zu ",n->attrs->tag);
+        printf(", char *value: %s ",n->attrs->value);
+        printf("HTML TAG = %d: %s \n", n->attrs->tag,
+               translateXMLEnumName[locate_HTML(n)]);
+
     }
+    if (n->attrsCount) printf("unsigned int attrsCount: %d\n",n->attrsCount);
+    if (n->attrsAlloc) printf("unsigned int attrsAlloc: %d\n", n->attrsAlloc);
+    if (n->target) printf("char *target: %s\n", n->target);
+    if (n->value) printf("char *value: %s\n", n->value);
     if (n->tag > 2)
-        printf("ODFKey = %s ", translateXMLEnumName[n->tag-10]);
-    printf("\n");
+        printf("Tag Text = %s ", translateXMLEnumName[n->tag]);
+    printf("\n==================================================\n");
+}
+
+char *printMissingTag(Tag tag)
+{
+    char *s = translateXMLEnumName[tag];
+    int len = strlen(s)+14;
+    char *r = malloc(len);
+    snprintf(r, len,"Missing tag: %s",s);
+    return r;
 }


Mime
View raw message