SVM for text classification in R
Asked Answered
S

1

2

I am using SVM to classify my text where in i don't actually get the result instead get with numerical probabilities.

Dataframe (1:20 trained set, 21:50 test set)

Updated:

     ou <- structure(list(text = structure(c(1L, 6L, 1L, 1L, 8L, 13L, 24L, 
5L, 11L, 12L, 33L, 36L, 20L, 25L, 4L, 19L, 9L, 29L, 22L, 3L, 
8L, 8L, 8L, 2L, 8L, 27L, 30L, 3L, 14L, 35L, 3L, 34L, 23L, 31L, 
22L, 6L, 6L, 7L, 17L, 3L, 8L, 32L, 18L, 15L, 21L, 26L, 3L, 16L, 
10L, 28L), .Label = c("access, access, access, access", "character(0)", 
"report", "report, access", "report, access, access", "report, access, access, access", 
"report, access, access, access, access, access, access", "report, access, access, access, access, access, access, access", 
"report, access, access, access, access, access, access, report", 
"report, access, access, access, access, access, report", "report, access, access, access, report", 
"report, access, access, access, report, access", "report, access, access, report, access, access, access, access, access, access", 
"report, data", "report, data, data", "report, data, data, data", 
"report, data, data, data, data", "report, data, data, data, data, data", 
"report, data, data, data, report, report, data, access,access", 
"report, data, data, report", "report, data, report", "report, report", 
"report, report, access, access, access", "report, report, access, access, report, report, report, report, report, report, data, data, report, access, report, report", 
"report, report, access, report, report, report, report, report, data, data, report, access, report, report", 
"report, report, access, report, report, report, report, report, report, data, data, report, access, report, report", 
"report, report, data", "report, report, data, report", "report, report, report, data, report, report, data, data, report, data, data", 
"report, report, report, report", "report, report, report, report, data, report, report, data, report, data, report", 
"report, report, report, report, report, data, report, data, data", 
"report, report, report, report, report, report, report", "report, report, report, report, report, report, report, access, access, access", 
"report, report, report, report, report, report, report, report, data, data, report, access, report, report", 
"report, report, report, report, report, report, report, report, report, report, data, report, report, report, report, report, report, report,report"
), class = "factor"), value = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 3L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
"Access", "Report/Data"), class = "factor")), .Names = c("text", 
"value"), class = "data.frame", row.names = c(NA, -50L))

Code used:

        library(RTextTools)

        doc_matrix <- create_matrix(ou$text, language="english", removeNumbers=TRUE, stemWords=TRUE, removeSparseTerms=.998)

        #container <- create_container(doc_matrix, ou$text, trainSize=1:20, testSize=21:50, virgin=FALSE)
        container <- create_container(doc_matrix, as.numeric(factor(ou$text)), trainSize=1:20, testSize=21:50, virgin=FALSE)

        #Training models
        SVM <- train_model(container,"SVM")
        MAXENT <- train_model(container,"MAXENT")
        BAGGING <- train_model(container,"BAGGING")
        TREE <- train_model(container,"TREE")

        #Classify data using trained models
        SVM_CLASSIFY <- classify_model(container, SVM)
        MAXENT_CLASSIFY <- classify_model(container, MAXENT)
        BAGGING_CLASSIFY <- classify_model(container, BAGGING)

        #Analytics

        analytics <- create_analytics(container,SVM_CLASSIFY)

        models <- train_models(container, algorithms=c("MAXENT","SVM"))
        results <- classify_models(container, models)
        analytics <- create_analytics(container, results)
        summary(analytics)
        SVM <- cross_validate(container, 5, "SVM")
        write.csv(analytics@document_summary, "DocumentSummary.csv")

expected result:

          text                                                          value
     21 report, access, access, access, access, access, access, access       Access
     22 report, access, access, access, access, access, access, access       Access
     23 report, access, access, access, access, access, access, access       Access
     24 character(0)                                                          NA
     25 report, access, access, access, access, access, access, access       Access
     26 report, report, data                                             Report/Data
     27 report, report, report, report                                   Report/Data
     28 report                                                          Report/Data
     29 report, data                                                    Report/Data
     30 report, report, report, report, report, report, report, report,
         data, data, report, access, report, report                      Report/Data

the result where probabilities are :

>   MAXENTROPY_LABEL    MAXENTROPY_PROB SVM_LABEL   SVM_PROB    MANUAL_CODE CONSENSUS_CODE  CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE    PROBABILITY_INCORRECT
> 1 8   0.999999066 22  0.070090645 8   8   1   0   8   0
> 2 8   0.999999066 22  0.070090645 8   8   1   0   8   0
> 3 8   0.999999066 22  0.070090645 8   8   1   0   8   0
> 4 1   0.055555556 12  0.071384112 2   12  1   1   12  1
> 5 8   0.999999066 22  0.070090645 8   8   1   0   8   0
> 6 25  1   12  0.074126949 27  25  1   1   25  1
> 7 33  0.627904676 13  0.068572857 30  33  1   1   33  1
> 8 33  0.406792176 12  0.074592181 3   33  1   1   33  1
> 9 20  1   12  0.074507793 14  20  1   1   20  1

EDIT 1: How can i achieve the label names instead of SVM label numbers.

Samsun answered 17/4, 2015 at 7:11 Comment(7)
Can you make the question reproducible? Do you get expected results if you run examples from the package?Homograph
hope now you can use itSamsun
I don't think that will cut it. Please also name the datasets (e.g. ou).Homograph
Can you look for my edits now !Samsun
Can svm be used to label the text document!Samsun
Your example still doesn't run for me. Can you redo your question so that the example will run in a clean R session?Homograph
@RomanLuštrik please check for my updated question nowSamsun
M
0

What I usually do is

ou <- cbind(ou$text, results)

And to have the labels printed:

ou$value <- "NONE"
ou$value[results$SVM_LABEL=="1"]  <- "Access"
ou$value[results$SVM_LABEL=="-1"] <- "Report/Data"
ou 

(assuming you used 1 and -1 when training the model)

I know it's a little bit primitive but it's clear and works fine

Milldam answered 27/5, 2016 at 20:21 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.